[llvm] [AMDGPU] Factor out common code from SIInsertWaitcnts (PR #83018)
Christudasan Devadasan via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 26 07:44:07 PST 2024
https://github.com/cdevadas created https://github.com/llvm/llvm-project/pull/83018
The SIInsertWaitcnts pass inserts the various wait counts required for the operands of memory operations. For a subtarget, a new waitcnt insertion should be attempted after the Hazard Recognizer, which runs later in the pipeline than where SIInsertWaitcnts is currently placed.
Factor out the common code into Utils/AMDGPUWaitCountUtils so that most of it can be reused by the new waitcnt insertion pass as well.
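For context, the rough shape of the base class that the new Utils/AMDGPUWaitCountUtils.h is expected to provide can be inferred from the SIWaitCntsInserter overrides in the diff below. The following is only an illustrative sketch, not the header from this patch; the real file also carries WaitcntBrackets, the WaitcntGenerator hierarchy and the counter enums, and the exact member list and signatures may differ:

class AMDGPUWaitCntInserter {
public:
  AMDGPUWaitCntInserter() = default;
  // Constructor arguments mirror the SIWaitCntsInserter constructor in the
  // diff: subtarget, register info, a waitcnt generator, and the maximum
  // counter type in use for the subtarget.
  AMDGPUWaitCntInserter(const GCNSubtarget *ST, const MachineRegisterInfo *MRI,
                        WaitCntGenerator *WCG, InstCounterType MaxCounter)
      : ST(ST), MRI(MRI), WCG(WCG), MaxCounter(MaxCounter) {}
  virtual ~AMDGPUWaitCntInserter() = default;

  // Hooks overridden by SIWaitCntsInserter (and, later, by the new
  // post-hazard-recognizer inserter).
  virtual bool generateWaitcntInstBefore(MachineInstr &MI,
                                         WaitcntBrackets &ScoreBrackets,
                                         MachineInstr *OldWaitcntInstr,
                                         bool FlushVmCnt,
                                         VGPRInstsSet *VGPRInsts) = 0;
  virtual bool insertWaitcntInBlock(MachineFunction &MF,
                                    MachineBasicBlock &Block,
                                    WaitcntBrackets &ScoreBrackets,
                                    VGPRInstsSet *VGPRInsts = nullptr) = 0;
  virtual void updateEventWaitcntAfter(MachineInstr &Inst,
                                       WaitcntBrackets *ScoreBrackets) = 0;

protected:
  const GCNSubtarget *ST = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  WaitCntGenerator *WCG = nullptr;
  InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
  // Assumed members: the SIWaitCntsInserter code in the diff uses TII/TRI
  // without declaring them locally, so they presumably live in the base.
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
};

Everything above except the TII/TRI members is taken from how SIWaitCntsInserter constructs and overrides the base class in the patch.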
>From bd4843eb882bfb680b946a1a80d7afa41f6b9711 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 26 Feb 2024 20:11:56 +0530
Subject: [PATCH] [AMDGPU] Factor out common code from SIInsertWaitcnts
The SIInsertWaitcnts pass inserts the various wait counts required
for the operands of memory operations. For a subtarget, a new
waitcnt insertion should be attempted after the Hazard Recognizer,
which runs later in the pipeline than where SIInsertWaitcnts is
currently placed.
Factor out the common code into Utils/AMDGPUWaitCountUtils so that
most of it can be reused by the new waitcnt insertion pass as
well.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2503 +++--------------
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 +-
.../AMDGPU/Utils/AMDGPUWaitCountUtils.cpp | 1393 +++++++++
.../AMDGPU/Utils/AMDGPUWaitCountUtils.h | 531 ++++
llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt | 1 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +-
6 files changed, 2289 insertions(+), 2152 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp
create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a6184c5e1e0487..19d5ae17d3ec17 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -28,6 +28,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUWaitCountUtils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
@@ -38,6 +39,7 @@
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -53,1540 +55,229 @@ static cl::opt<bool> ForceEmitZeroFlag(
cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
cl::init(false), cl::Hidden);
-namespace {
-// Class of object that encapsulates latest instruction counter score
-// associated with the operand. Used for determining whether
-// s_waitcnt instruction needs to be emitted.
-
-enum InstCounterType {
- LOAD_CNT = 0, // VMcnt prior to gfx12.
- DS_CNT, // LKGMcnt prior to gfx12.
- EXP_CNT, //
- STORE_CNT, // VScnt in gfx10/gfx11.
- NUM_NORMAL_INST_CNTS,
- SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
- BVH_CNT, // gfx12+ only.
- KM_CNT, // gfx12+ only.
- NUM_EXTENDED_INST_CNTS,
- NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
-};
-} // namespace
-
-namespace llvm {
-template <> struct enum_iteration_traits<InstCounterType> {
- static constexpr bool is_iterable = true;
-};
-} // namespace llvm
-
-namespace {
-// Return an iterator over all counters between LOAD_CNT (the first counter)
-// and \c MaxCounter (exclusive, default value yields an enumeration over
-// all counters).
-auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
- return enum_seq(LOAD_CNT, MaxCounter);
-}
-
-using RegInterval = std::pair<int, int>;
-
-struct HardwareLimits {
- unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
- unsigned ExpcntMax;
- unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
- unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
- unsigned SamplecntMax; // gfx12+ only.
- unsigned BvhcntMax; // gfx12+ only.
- unsigned KmcntMax; // gfx12+ only.
-};
-
-struct RegisterEncoding {
- unsigned VGPR0;
- unsigned VGPRL;
- unsigned SGPR0;
- unsigned SGPRL;
-};
-
-enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
- VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
- VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
- SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
- EXP_LDS_ACCESS, // read by ldsdir counting as export
- NUM_WAIT_EVENTS,
-};
-
-// The mapping is:
-// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
-// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
-// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
-// We reserve a fixed number of VGPR slots in the scoring tables for
-// special tokens like SCMEM_LDS (needed for buffer load to LDS).
-enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
- AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
- SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
- NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
- // Artificial register slots to track LDS writes into specific LDS locations
- // if a location is known. When slots are exhausted or location is
- // unknown use the first slot. The first slot is also always updated in
- // addition to known location's slot to properly generate waits if dependent
- // instruction's location is unknown.
- EXTRA_VGPR_LDS = 0,
- NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
-};
-
-// Enumerate different types of result-returning VMEM operations. Although
-// s_waitcnt orders them all with a single vmcnt counter, in the absence of
-// s_waitcnt only instructions of the same VmemType are guaranteed to write
-// their results in order -- so there is no need to insert an s_waitcnt between
-// two instructions of the same type that write the same vgpr.
-enum VmemType {
- // BUF instructions and MIMG instructions without a sampler.
- VMEM_NOSAMPLER,
- // MIMG instructions with a sampler.
- VMEM_SAMPLER,
- // BVH instructions
- VMEM_BVH,
- NUM_VMEM_TYPES
-};
-
-// Maps values of InstCounterType to the instruction that waits on that
-// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
-// returns true.
-static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
- AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
- AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
- AMDGPU::S_WAIT_KMCNT};
-
-static bool updateVMCntOnly(const MachineInstr &Inst) {
- return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
- SIInstrInfo::isFLATScratch(Inst);
-}
-
-#ifndef NDEBUG
-static bool isNormalMode(InstCounterType MaxCounter) {
- return MaxCounter == NUM_NORMAL_INST_CNTS;
-}
-#endif // NDEBUG
-
-VmemType getVmemType(const MachineInstr &Inst) {
- assert(updateVMCntOnly(Inst));
- if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
- !SIInstrInfo::isVSAMPLE(Inst))
- return VMEM_NOSAMPLER;
- const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
- const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
- AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
- return BaseInfo->BVH ? VMEM_BVH
- : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
-}
-
-unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
- switch (T) {
- case LOAD_CNT:
- return Wait.LoadCnt;
- case EXP_CNT:
- return Wait.ExpCnt;
- case DS_CNT:
- return Wait.DsCnt;
- case STORE_CNT:
- return Wait.StoreCnt;
- case SAMPLE_CNT:
- return Wait.SampleCnt;
- case BVH_CNT:
- return Wait.BvhCnt;
- case KM_CNT:
- return Wait.KmCnt;
- default:
- llvm_unreachable("bad InstCounterType");
- }
-}
-
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
- unsigned &WC = getCounterRef(Wait, T);
- WC = std::min(WC, Count);
-}
-
-void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
- getCounterRef(Wait, T) = ~0u;
-}
-
-unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
- return getCounterRef(Wait, T);
-}
-
-// Mapping from event to counter according to the table masks.
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
- for (auto T : inst_counter_types()) {
- if (masks[T] & (1 << E))
- return T;
- }
- llvm_unreachable("event type has no associated counter");
-}
+//===----------------------------------------------------------------------===//
+// SIWaitCntsInserter helper class interface.
+//===----------------------------------------------------------------------===//
-// This objects maintains the current score brackets of each wait counter, and
-// a per-register scoreboard for each wait counter.
-//
-// We also maintain the latest score for every event type that can change the
-// waitcnt in order to know if there are multiple types of events within
-// the brackets. When multiple types of event happen in the bracket,
-// wait count may get decreased out of order, therefore we need to put in
-// "s_waitcnt 0" before use.
-class WaitcntBrackets {
+class SIWaitCntsInserter : public AMDGPUWaitCntInserter {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
- HardwareLimits Limits, RegisterEncoding Encoding,
- const unsigned *WaitEventMaskForInst,
- InstCounterType SmemAccessCounter)
- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
- Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
- SmemAccessCounter(SmemAccessCounter) {}
-
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- default:
- break;
- }
- return 0;
- }
-
- unsigned getScoreLB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreLBs[T];
- }
-
- unsigned getScoreUB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreUBs[T];
- }
-
- unsigned getScoreRange(InstCounterType T) const {
- return getScoreUB(T) - getScoreLB(T);
- }
-
- unsigned getRegScore(int GprNo, InstCounterType T) const {
- if (GprNo < NUM_ALL_VGPRS) {
- return VgprScores[T][GprNo];
- }
- assert(T == SmemAccessCounter);
- return SgprScores[GprNo - NUM_ALL_VGPRS];
- }
-
- bool merge(const WaitcntBrackets &Other);
-
- RegInterval getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI, unsigned OpNo) const;
-
- bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
- void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
- void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
- void applyWaitcnt(InstCounterType T, unsigned Count);
- void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, WaitEventType E,
- MachineInstr &MI);
-
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
- }
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
- return HasPending;
- }
-
- bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
- // Return true if more than one bit is set in Events.
- return Events & (Events - 1);
- }
-
- bool hasPendingFlat() const {
- return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
- LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
- (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
- LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
- }
-
- void setPendingFlat() {
- LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
- LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
- }
-
- // Return true if there might be pending writes to the specified vgpr by VMEM
- // instructions with types different from V.
- bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
- assert(GprNo < NUM_ALL_VGPRS);
- return VgprVmemTypes[GprNo] & ~(1 << V);
- }
-
- void clearVgprVmemTypes(int GprNo) {
- assert(GprNo < NUM_ALL_VGPRS);
- VgprVmemTypes[GprNo] = 0;
- }
-
- void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
- }
-
- ArrayRef<const MachineInstr *> getLDSDMAStores() const {
- return LDSDMAStores;
+ SIWaitCntsInserter() {}
+ SIWaitCntsInserter(const GCNSubtarget *ST, const MachineRegisterInfo *MRI,
+ WaitCntGenerator *WCG, InstCounterType MC, bool FEZWC,
+ MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
+ AliasAnalysis *AA)
+ : AMDGPUWaitCntInserter(ST, MRI, WCG, MC), MLI(MLI), PDT(PDT), AA(AA),
+ ForceEmitZeroWaitcnts(FEZWC) {
+ for (auto T : inst_counter_types())
+ ForceEmitWaitcnt[T] = false;
}
-
- void print(raw_ostream &);
- void dump() { print(dbgs()); }
+ bool generateWaitcntInstBefore(MachineInstr &MI,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
+ VGPRInstsSet *VGPRInsts) override;
+ bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ VGPRInstsSet *VGPRInsts = nullptr) override;
+ void updateEventWaitcntAfter(MachineInstr &Inst,
+ WaitcntBrackets *ScoreBrackets) override;
private:
- struct MergeInfo {
- unsigned OldLB;
- unsigned OtherLB;
- unsigned MyShift;
- unsigned OtherShift;
- };
- static bool mergeScore(const MergeInfo &M, unsigned &Score,
- unsigned OtherScore);
-
- void setScoreLB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreLBs[T] = Val;
- }
-
- void setScoreUB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreUBs[T] = Val;
-
- if (T != EXP_CNT)
- return;
-
- if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
- }
-
- void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
- if (GprNo < NUM_ALL_VGPRS) {
- VgprUB = std::max(VgprUB, GprNo);
- VgprScores[T][GprNo] = Val;
- } else {
- assert(T == SmemAccessCounter);
- SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
- SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
- }
- }
-
- void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
- const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
- unsigned OpNo, unsigned Val);
-
- const GCNSubtarget *ST = nullptr;
- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
- HardwareLimits Limits = {};
- RegisterEncoding Encoding = {};
- const unsigned *WaitEventMaskForInst;
- InstCounterType SmemAccessCounter;
- unsigned ScoreLBs[NUM_INST_CNTS] = {0};
- unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
- // Remember the last flat memory operation.
- unsigned LastFlat[NUM_INST_CNTS] = {0};
- // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int VgprUB = -1;
- int SgprUB = -1;
- unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
- unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
- // Bitmask of the VmemTypes of VMEM instructions that might have a pending
- // write to each vgpr.
- unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
- // Store representative LDS DMA operations. The only useful info here is
- // alias info. One store is kept per unique AAInfo.
- SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
-};
-
-// This abstracts the logic for generating and updating S_WAIT* instructions
-// away from the analysis that determines where they are needed. This was
-// done because the set of counters and instructions for waiting on them
-// underwent a major shift with gfx12, sufficiently so that having this
-// abstraction allows the main analysis logic to be simpler than it would
-// otherwise have had to become.
-class WaitcntGenerator {
-protected:
- const GCNSubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
- AMDGPU::IsaVersion IV;
- InstCounterType MaxCounter;
-
-public:
- WaitcntGenerator() {}
- WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
- : ST(ST), TII(ST->getInstrInfo()),
- IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
-
- // Edits an existing sequence of wait count instructions according
- // to an incoming Waitcnt value, which is itself updated to reflect
- // any new wait count instructions which may need to be generated by
- // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
- // were made.
- //
- // This editing will usually be merely updated operands, but it may also
- // delete instructions if the incoming Wait value indicates they are not
- // needed. It may also remove existing instructions for which a wait
- // is needed if it can be determined that it is better to generate new
- // instructions later, as can happen on gfx12.
- virtual bool
- applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
- MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
- MachineBasicBlock::instr_iterator It) const = 0;
-
- // Transform a soft waitcnt into a normal one.
- bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
-
- // Generates new wait count instructions according to the value of
- // Wait, returning true if any new instructions were created.
- virtual bool createNewWaitcnt(MachineBasicBlock &Block,
- MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) = 0;
-
- // Returns an array of bit masks which can be used to map values in
- // WaitEventType to corresponding counter values in InstCounterType.
- virtual const unsigned *getWaitEventMask() const = 0;
-
- // Returns a new waitcnt with all counters except VScnt set to 0. If
- // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
- virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
-
- virtual ~WaitcntGenerator() = default;
-};
-
-class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
-public:
- WaitcntGeneratorPreGFX12() {}
- WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
- : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
-
- bool
- applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
- MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
- MachineBasicBlock::instr_iterator It) const override;
-
- bool createNewWaitcnt(MachineBasicBlock &Block,
- MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
-
- const unsigned *getWaitEventMask() const override {
- assert(ST);
-
- static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
- (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
- (1 << SQ_MESSAGE),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
- (1 << EXP_LDS_ACCESS),
- (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
- 0,
- 0,
- 0};
-
- return WaitEventMaskForInstPreGFX12;
- }
-
- virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
-};
-
-class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
-public:
- WaitcntGeneratorGFX12Plus() {}
- WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
- : WaitcntGenerator(ST, MaxCounter) {}
-
- bool
- applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
- MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
- MachineBasicBlock::instr_iterator It) const override;
-
- bool createNewWaitcnt(MachineBasicBlock &Block,
- MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
-
- const unsigned *getWaitEventMask() const override {
- assert(ST);
-
- static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
- (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
- (1 << EXP_LDS_ACCESS),
- (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
- (1 << VMEM_SAMPLER_READ_ACCESS),
- (1 << VMEM_BVH_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
-
- return WaitEventMaskForInstGFX12Plus;
- }
-
- virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
-};
-
-class SIInsertWaitcnts : public MachineFunctionPass {
-private:
- const GCNSubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI = nullptr;
-
- DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
- DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
AliasAnalysis *AA = nullptr;
- struct BlockInfo {
- std::unique_ptr<WaitcntBrackets> Incoming;
- bool Dirty = true;
- };
-
- InstCounterType SmemAccessCounter;
+ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+ bool isPreheaderToFlush(MachineBasicBlock &MBB,
+ WaitcntBrackets &ScoreBrackets);
+ bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets) const;
+ WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const;
+ void setForceEmitWaitcnt();
- MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
+ DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+ DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
// because of amdgpu-waitcnt-forcezero flag
bool ForceEmitZeroWaitcnts;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
+};
- bool OptNone;
-
- // In any given run of this pass, WCG will point to one of these two
- // generator objects, which must have been re-initialised before use
- // from a value made using a subtarget constructor.
- WaitcntGeneratorPreGFX12 WCGPreGFX12;
- WaitcntGeneratorGFX12Plus WCGGFX12Plus;
-
- WaitcntGenerator *WCG = nullptr;
+// This is a flat memory operation. Check to see if it has memory tokens for
+// either LDS or FLAT.
+bool SIWaitCntsInserter::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+ assert(TII->isFLAT(MI));
- // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
- // message.
- DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
+ if (!TII->usesLGKM_CNT(MI))
+ return false;
- InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+ // If in tgsplit mode then there can be no use of LDS.
+ if (ST->isTgSplitEnabled())
+ return false;
-public:
- static char ID;
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access LDS.
+ if (MI.memoperands_empty())
+ return true;
- SIInsertWaitcnts() : MachineFunctionPass(ID) {
- (void)ForceExpCounter;
- (void)ForceLgkmCounter;
- (void)ForceVMCounter;
+ // See if any memory operand specifies an address space that involves LDS.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+ return true;
}
- bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
- bool isPreheaderToFlush(MachineBasicBlock &MBB,
- WaitcntBrackets &ScoreBrackets);
- bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert wait instructions";
- }
+ return false;
+}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<MachineLoopInfo>();
- AU.addRequired<MachinePostDominatorTree>();
- AU.addUsedIfAvailable<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+bool SIWaitCntsInserter::isPreheaderToFlush(MachineBasicBlock &MBB,
+ WaitcntBrackets &ScoreBrackets) {
+ auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
+ if (!IsInserted)
+ return Iterator->second;
- bool isForceEmitWaitcnt() const {
- for (auto T : inst_counter_types())
- if (ForceEmitWaitcnt[T])
- return true;
+ MachineBasicBlock *Succ = MBB.getSingleSuccessor();
+ if (!Succ)
return false;
- }
-
- void setForceEmitWaitcnt() {
-// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
-// For debug builds, get the debug counter info and adjust if need be
-#ifndef NDEBUG
- if (DebugCounter::isCounterSet(ForceExpCounter) &&
- DebugCounter::shouldExecute(ForceExpCounter)) {
- ForceEmitWaitcnt[EXP_CNT] = true;
- } else {
- ForceEmitWaitcnt[EXP_CNT] = false;
- }
-
- if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
- DebugCounter::shouldExecute(ForceLgkmCounter)) {
- ForceEmitWaitcnt[DS_CNT] = true;
- ForceEmitWaitcnt[KM_CNT] = true;
- } else {
- ForceEmitWaitcnt[DS_CNT] = false;
- ForceEmitWaitcnt[KM_CNT] = false;
- }
- if (DebugCounter::isCounterSet(ForceVMCounter) &&
- DebugCounter::shouldExecute(ForceVMCounter)) {
- ForceEmitWaitcnt[LOAD_CNT] = true;
- ForceEmitWaitcnt[SAMPLE_CNT] = true;
- ForceEmitWaitcnt[BVH_CNT] = true;
- } else {
- ForceEmitWaitcnt[LOAD_CNT] = false;
- ForceEmitWaitcnt[SAMPLE_CNT] = false;
- ForceEmitWaitcnt[BVH_CNT] = false;
- }
-#endif // NDEBUG
- }
-
- // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
- // FLAT instruction.
- WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
- // Maps VMEM access types to their corresponding WaitEventType.
- static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
- VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
-
- assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
- // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
- // these should use VM_CNT.
- if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
- return VMEM_ACCESS;
- if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
- // FLAT and SCRATCH instructions may access scratch. Other VMEM
- // instructions do not.
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
- return SCRATCH_WRITE_ACCESS;
- return VMEM_WRITE_ACCESS;
- }
- if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
- return VMEM_READ_ACCESS;
- return VmemReadMapping[getVmemType(Inst)];
- }
-
- bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
- bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
- bool generateWaitcntInstBefore(MachineInstr &MI,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt);
- bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
- bool generateWaitcnt(AMDGPU::Waitcnt Wait,
- MachineBasicBlock::instr_iterator It,
- MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
- void updateEventWaitcntAfter(MachineInstr &Inst,
- WaitcntBrackets *ScoreBrackets);
- bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets);
-};
+ MachineLoop *Loop = MLI->getLoopFor(Succ);
+ if (!Loop)
+ return false;
-} // end anonymous namespace
-
-RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
- unsigned OpNo) const {
- const MachineOperand &Op = MI->getOperand(OpNo);
- if (!TRI->isInAllocatableClass(Op.getReg()))
- return {-1, -1};
-
- // A use via a PW operand does not need a waitcnt.
- // A partial write is not a WAW.
- assert(!Op.getSubReg() || !Op.isUndef());
-
- RegInterval Result;
-
- unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
- AMDGPU::HWEncoding::REG_IDX_MASK;
-
- if (TRI->isVectorRegister(*MRI, Op.getReg())) {
- assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
- Result.first = Reg - Encoding.VGPR0;
- if (TRI->isAGPR(*MRI, Op.getReg()))
- Result.first += AGPR_OFFSET;
- assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
- } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
- Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
- assert(Result.first >= NUM_ALL_VGPRS &&
- Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+ if (Loop->getLoopPreheader() == &MBB &&
+ shouldFlushVmCnt(Loop, ScoreBrackets)) {
+ Iterator->second = true;
+ return true;
}
- // TODO: Handle TTMP
- // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
- else
- return {-1, -1};
-
- const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.second = Result.first + ((Size + 16) / 32);
- return Result;
-}
-
-void WaitcntBrackets::setExpScore(const MachineInstr *MI,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, unsigned OpNo,
- unsigned Val) {
- RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
- assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- setRegScore(RegNo, EXP_CNT, Val);
- }
+ return false;
}
-void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(WaitEventMaskForInst, E);
-
- unsigned UB = getScoreUB(T);
- unsigned CurrScore = UB + 1;
- if (CurrScore == 0)
- report_fatal_error("InsertWaitcnt score wraparound");
- // PendingEvents and ScoreUB need to be update regardless if this event
- // changes the score of a register or not.
- // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
- PendingEvents |= 1 << E;
- setScoreUB(T, CurrScore);
-
- if (T == EXP_CNT) {
- // Put score on the source vgprs. If this is a store, just use those
- // specific register(s).
- if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
- int AddrOpIdx =
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
- // All GDS operations must protect their address register (same as
- // export.)
- if (AddrOpIdx != -1) {
- setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
- }
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+// vgpr containing a value that is loaded outside of the loop. (Only on
+// targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+// loop, and at least one use of a vgpr containing a value that is loaded
+// outside of the loop.
+bool SIWaitCntsInserter::shouldFlushVmCnt(MachineLoop *ML,
+ WaitcntBrackets &Brackets) const {
+ bool HasVMemLoad = false;
+ bool HasVMemStore = false;
+ bool UsesVgprLoadedOutside = false;
+ DenseSet<Register> VgprUse;
+ DenseSet<Register> VgprDef;
- if (Inst.mayStore()) {
- if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
- CurrScore);
- }
- if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
- setExpScore(&Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
- AMDGPU::OpName::data1),
- CurrScore);
- }
- } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
- Inst.getOpcode() != AMDGPU::DS_APPEND &&
- Inst.getOpcode() != AMDGPU::DS_CONSUME &&
- Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
- for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = Inst.getOperand(I);
- if (Op.isReg() && !Op.isDef() &&
- TRI->isVectorRegister(*MRI, Op.getReg())) {
- setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
- }
- }
- }
- } else if (TII->isFLAT(Inst)) {
- if (Inst.mayStore()) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
- CurrScore);
- } else if (SIInstrInfo::isAtomicRet(Inst)) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
- CurrScore);
- }
- } else if (TII->isMIMG(Inst)) {
- if (Inst.mayStore()) {
- setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- } else if (SIInstrInfo::isAtomicRet(Inst)) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
- CurrScore);
- }
- } else if (TII->isMTBUF(Inst)) {
- if (Inst.mayStore()) {
- setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- }
- } else if (TII->isMUBUF(Inst)) {
- if (Inst.mayStore()) {
- setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- } else if (SIInstrInfo::isAtomicRet(Inst)) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
- CurrScore);
- }
- } else if (TII->isLDSDIR(Inst)) {
- // LDSDIR instructions attach the score to the destination.
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
- CurrScore);
- } else {
- if (TII->isEXP(Inst)) {
- // For export the destination registers are really temps that
- // can be used as the actual source after export patching, so
- // we need to treat them like sources and set the EXP_CNT
- // score.
- for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- MachineOperand &DefMO = Inst.getOperand(I);
- if (DefMO.isReg() && DefMO.isDef() &&
- TRI->isVGPR(*MRI, DefMO.getReg())) {
- setRegScore(
- TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
- EXP_CNT, CurrScore);
- }
- }
- }
- for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- MachineOperand &MO = Inst.getOperand(I);
- if (MO.isReg() && !MO.isDef() &&
- TRI->isVectorRegister(*MRI, MO.getReg())) {
- setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
- }
- }
- }
-#if 0 // TODO: check if this is handled by MUBUF code above.
- } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
- Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
- Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
- MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
- unsigned OpNo;//TODO: find the OpNo for this operand;
- RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
- for (int RegNo = Interval.first; RegNo < Interval.second;
- ++RegNo) {
- setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
- }
-#endif
- } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
- // Match the score to the destination registers.
- for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- auto &Op = Inst.getOperand(I);
- if (!Op.isReg() || !Op.isDef())
- continue;
- RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
- if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
- if (Interval.first >= NUM_ALL_VGPRS)
- continue;
- if (updateVMCntOnly(Inst)) {
- // updateVMCntOnly should only leave us with VGPRs
- // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
- // defs. That's required for a sane index into `VgprMemTypes` below
- assert(TRI->isVectorRegister(*MRI, Op.getReg()));
- VmemType V = getVmemType(Inst);
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
- VgprVmemTypes[RegNo] |= 1 << V;
- }
- }
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- setRegScore(RegNo, T, CurrScore);
+ for (MachineBasicBlock *MBB : ML->blocks()) {
+ for (MachineInstr &MI : *MBB) {
+ if (isVMEMOrFlatVMEM(MI)) {
+ if (MI.mayLoad())
+ HasVMemLoad = true;
+ if (MI.mayStore())
+ HasVMemStore = true;
}
- }
- if (Inst.mayStore() &&
- (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
- // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
- // written can be accessed. A load from LDS to VMEM does not need a wait.
- unsigned Slot = 0;
- for (const auto *MemOp : Inst.memoperands()) {
- if (!MemOp->isStore() ||
- MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
- // Comparing just AA info does not guarantee memoperands are equal
- // in general, but this is so for LDS DMA in practice.
- auto AAI = MemOp->getAAInfo();
- // Alias scope information gives a way to definitely identify an
- // original memory object and practically produced in the module LDS
- // lowering pass. If there is no scope available we will not be able
- // to disambiguate LDS aliasing as after the module lowering all LDS
- // is squashed into a single big object. Do not attempt to use one of
- // the limited LDSDMAStores for something we will not be able to use
- // anyway.
- if (!AAI || !AAI.Scope)
- break;
- for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
- for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
- if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
- Slot = I + 1;
+ auto [RegLow, RegHigh] = Brackets.getRegInterval(&MI, MRI, TRI, I);
+ // Vgpr use
+ if (Op.isUse()) {
+ for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+ // If we find a register that is loaded inside the loop, 1. and 2.
+ // are invalidated and we can exit.
+ if (VgprDef.contains(RegNo))
+ return false;
+ VgprUse.insert(RegNo);
+ // If at least one of Op's registers is in the score brackets, the
+ // value is likely loaded outside of the loop.
+ if (Brackets.getRegScore(RegNo, LOAD_CNT) >
+ Brackets.getScoreLB(LOAD_CNT) ||
+ Brackets.getRegScore(RegNo, SAMPLE_CNT) >
+ Brackets.getScoreLB(SAMPLE_CNT) ||
+ Brackets.getRegScore(RegNo, BVH_CNT) >
+ Brackets.getScoreLB(BVH_CNT)) {
+ UsesVgprLoadedOutside = true;
break;
}
}
- }
- if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
- break;
- LDSDMAStores.push_back(&Inst);
- Slot = LDSDMAStores.size();
- break;
- }
- setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
- if (Slot)
- setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
- }
- }
-}
-
-void WaitcntBrackets::print(raw_ostream &OS) {
- OS << '\n';
- for (auto T : inst_counter_types(MaxCounter)) {
- unsigned SR = getScoreRange(T);
-
- switch (T) {
- case LOAD_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
- << SR << "): ";
- break;
- case DS_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
- << SR << "): ";
- break;
- case EXP_CNT:
- OS << " EXP_CNT(" << SR << "): ";
- break;
- case STORE_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
- << SR << "): ";
- break;
- case SAMPLE_CNT:
- OS << " SAMPLE_CNT(" << SR << "): ";
- break;
- case BVH_CNT:
- OS << " BVH_CNT(" << SR << "): ";
- break;
- case KM_CNT:
- OS << " KM_CNT(" << SR << "): ";
- break;
- default:
- OS << " UNKNOWN(" << SR << "): ";
- break;
- }
-
- if (SR != 0) {
- // Print vgpr scores.
- unsigned LB = getScoreLB(T);
-
- for (int J = 0; J <= VgprUB; J++) {
- unsigned RegScore = getRegScore(J, T);
- if (RegScore <= LB)
- continue;
- unsigned RelScore = RegScore - LB - 1;
- if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
- OS << RelScore << ":v" << J << " ";
- } else {
- OS << RelScore << ":ds ";
- }
- }
- // Also need to print sgpr scores for lgkm_cnt.
- if (T == SmemAccessCounter) {
- for (int J = 0; J <= SgprUB; J++) {
- unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
- if (RegScore <= LB)
- continue;
- unsigned RelScore = RegScore - LB - 1;
- OS << RelScore << ":s" << J << " ";
- }
- }
- }
- OS << '\n';
- }
- OS << '\n';
-}
-
-/// Simplify the waitcnt, in the sense of removing redundant counts, and return
-/// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
- simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
- simplifyWaitcnt(DS_CNT, Wait.DsCnt);
- simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
- simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
- simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
- simplifyWaitcnt(KM_CNT, Wait.KmCnt);
-}
-
-void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
- unsigned &Count) const {
- // The number of outstanding events for this type, T, can be calculated
- // as (UB - LB). If the current Count is greater than or equal to the number
- // of outstanding events, then the wait for this counter is redundant.
- if (Count >= getScoreRange(T))
- Count = ~0u;
-}
-
-void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
- AMDGPU::Waitcnt &Wait) const {
- unsigned ScoreToWait = getRegScore(RegNo, T);
-
- // If the score of src_operand falls within the bracket, we need an
- // s_waitcnt instruction.
- const unsigned LB = getScoreLB(T);
- const unsigned UB = getScoreUB(T);
- if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !ST->hasFlatLgkmVMemCountInOrder()) {
- // If there is a pending FLAT operation, and this is a VMem or LGKM
- // waitcnt and the target can report early completion, then we need
- // to force a waitcnt 0.
- addWait(Wait, T, 0);
- } else if (counterOutOfOrder(T)) {
- // Counter can get decremented out-of-order when there
- // are multiple types event in the bracket. Also emit an s_wait counter
- // with a conservative value of 0 for the counter.
- addWait(Wait, T, 0);
- } else {
- // If a counter has been maxed out avoid overflow by waiting for
- // MAX(CounterType) - 1 instead.
- unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
- addWait(Wait, T, NeededWait);
- }
- }
-}
-
-void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
- applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- applyWaitcnt(EXP_CNT, Wait.ExpCnt);
- applyWaitcnt(DS_CNT, Wait.DsCnt);
- applyWaitcnt(STORE_CNT, Wait.StoreCnt);
- applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
- applyWaitcnt(BVH_CNT, Wait.BvhCnt);
- applyWaitcnt(KM_CNT, Wait.KmCnt);
-}
-
-void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
- const unsigned UB = getScoreUB(T);
- if (Count >= UB)
- return;
- if (Count != 0) {
- if (counterOutOfOrder(T))
- return;
- setScoreLB(T, std::max(getScoreLB(T), UB - Count));
- } else {
- setScoreLB(T, UB);
- PendingEvents &= ~WaitEventMaskForInst[T];
- }
-}
-
-// Where there are multiple types of event in the bracket of a counter,
-// the decrement may go out of order.
-bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
- // Scalar memory read always can go out of order.
- if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
- return true;
- return hasMixedPendingEvents(T);
-}
-
-INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
-INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
- false)
-
-char SIInsertWaitcnts::ID = 0;
-
-char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
-
-FunctionPass *llvm::createSIInsertWaitcntsPass() {
- return new SIInsertWaitcnts();
-}
-
-static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
- unsigned NewEnc) {
- int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
- assert(OpIdx >= 0);
-
- MachineOperand &MO = MI.getOperand(OpIdx);
-
- if (NewEnc == MO.getImm())
- return false;
-
- MO.setImm(NewEnc);
- return true;
-}
-
-/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
-/// and if so, which counter it is waiting on.
-static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
- switch (Opcode) {
- case AMDGPU::S_WAIT_LOADCNT:
- return LOAD_CNT;
- case AMDGPU::S_WAIT_EXPCNT:
- return EXP_CNT;
- case AMDGPU::S_WAIT_STORECNT:
- return STORE_CNT;
- case AMDGPU::S_WAIT_SAMPLECNT:
- return SAMPLE_CNT;
- case AMDGPU::S_WAIT_BVHCNT:
- return BVH_CNT;
- case AMDGPU::S_WAIT_DSCNT:
- return DS_CNT;
- case AMDGPU::S_WAIT_KMCNT:
- return KM_CNT;
- default:
- return {};
- }
-}
-
-bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
- unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
- if (Opcode == Waitcnt->getOpcode())
- return false;
-
- Waitcnt->setDesc(TII->get(Opcode));
- return true;
-}
-
-/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
-/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
-/// from \p Wait that were added by previous passes. Currently this pass
-/// conservatively assumes that these preexisting waits are required for
-/// correctness.
-bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
- WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
- assert(ST);
- assert(isNormalMode(MaxCounter));
-
- bool Modified = false;
- MachineInstr *WaitcntInstr = nullptr;
- MachineInstr *WaitcntVsCntInstr = nullptr;
-
- for (auto &II :
- make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
- if (II.isMetaInstruction())
- continue;
-
- unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
- bool IsSoft = Opcode != II.getOpcode();
-
- // Update required wait count. If this is a soft waitcnt (= it was added
- // by an earlier pass), it may be entirely removed.
- if (Opcode == AMDGPU::S_WAITCNT) {
- unsigned IEnc = II.getOperand(0).getImm();
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- if (IsSoft)
- ScoreBrackets.simplifyWaitcnt(OldWait);
- Wait = Wait.combined(OldWait);
-
- // Merge consecutive waitcnt of the same type by erasing multiples.
- if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
- II.eraseFromParent();
- Modified = true;
- } else
- WaitcntInstr = &II;
- } else {
- assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
- assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-
- unsigned OldVSCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- if (IsSoft)
- ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
- Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
-
- if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
- II.eraseFromParent();
- Modified = true;
- } else
- WaitcntVsCntInstr = &II;
- }
- }
-
- if (WaitcntInstr) {
- Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
- AMDGPU::encodeWaitcnt(IV, Wait));
- Modified |= promoteSoftWaitCnt(WaitcntInstr);
-
- ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
- ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
- Wait.LoadCnt = ~0u;
- Wait.ExpCnt = ~0u;
- Wait.DsCnt = ~0u;
-
- LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
- ? dbgs()
- << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: " << *WaitcntInstr << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntInstr << '\n');
- }
-
- if (WaitcntVsCntInstr) {
- Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
- AMDGPU::OpName::simm16, Wait.StoreCnt);
- Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
-
- ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
- Wait.StoreCnt = ~0u;
-
- LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: " << *WaitcntVsCntInstr
- << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntVsCntInstr << '\n');
- }
-
- return Modified;
-}
-
-/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
-/// required counters in \p Wait
-bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
- MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
- assert(ST);
- assert(isNormalMode(MaxCounter));
-
- bool Modified = false;
- const DebugLoc &DL = Block.findDebugLoc(It);
-
- // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
- // single instruction while VScnt has its own instruction.
- if (Wait.hasWaitExceptStoreCnt()) {
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
-
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
-
- if (Wait.hasWaitStoreCnt()) {
- assert(ST->hasVscnt());
-
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.StoreCnt);
- Modified = true;
-
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
-
- return Modified;
-}
-
-AMDGPU::Waitcnt
-WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
- return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
-}
-
-AMDGPU::Waitcnt
-WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
- return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
-}
-
-/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
-/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
-/// were added by previous passes. Currently this pass conservatively
-/// assumes that these preexisting waits are required for correctness.
-bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
- WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
- assert(ST);
- assert(!isNormalMode(MaxCounter));
-
- bool Modified = false;
- MachineInstr *CombinedLoadDsCntInstr = nullptr;
- MachineInstr *CombinedStoreDsCntInstr = nullptr;
- MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
-
- for (auto &II :
- make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
- if (II.isMetaInstruction())
- continue;
-
- MachineInstr **UpdatableInstr;
-
- // Update required wait count. If this is a soft waitcnt (= it was added
- // by an earlier pass), it may be entirely removed.
-
- unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
- bool IsSoft = Opcode != II.getOpcode();
-
- if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
- unsigned OldEnc =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
- if (IsSoft)
- ScoreBrackets.simplifyWaitcnt(OldWait);
- Wait = Wait.combined(OldWait);
- UpdatableInstr = &CombinedLoadDsCntInstr;
- } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
- unsigned OldEnc =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
- if (IsSoft)
- ScoreBrackets.simplifyWaitcnt(OldWait);
- Wait = Wait.combined(OldWait);
- UpdatableInstr = &CombinedStoreDsCntInstr;
- } else {
- std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
- assert(CT.has_value());
- unsigned OldCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- if (IsSoft)
- ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
- addWait(Wait, CT.value(), OldCnt);
- UpdatableInstr = &WaitInstrs[CT.value()];
- }
-
- // Merge consecutive waitcnt of the same type by erasing multiples.
- if (!*UpdatableInstr) {
- *UpdatableInstr = &II;
- } else {
- II.eraseFromParent();
- Modified = true;
- }
- }
-
- if (CombinedLoadDsCntInstr) {
- // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
- // to be waited for. Otherwise, let the instruction be deleted so
- // the appropriate single counter wait instruction can be inserted
- // instead, when new S_WAIT_*CNT instructions are inserted by
- // createNewWaitcnt(). As a side effect, resetting the wait counts will
- // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
- // the loop below that deals with single counter instructions.
- if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
- unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
- Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
- AMDGPU::OpName::simm16, NewEnc);
- Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
- ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
- ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
- Wait.LoadCnt = ~0u;
- Wait.DsCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: "
- << *CombinedLoadDsCntInstr << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It << "New Instr: "
- << *CombinedLoadDsCntInstr << '\n');
- } else {
- CombinedLoadDsCntInstr->eraseFromParent();
- Modified = true;
- }
- }
-
- if (CombinedStoreDsCntInstr) {
- // Similarly for S_WAIT_STORECNT_DSCNT.
- if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
- unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
- Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
- AMDGPU::OpName::simm16, NewEnc);
- Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
- ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
- ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
- Wait.StoreCnt = ~0u;
- Wait.DsCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: "
- << *CombinedStoreDsCntInstr << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It << "New Instr: "
- << *CombinedStoreDsCntInstr << '\n');
- } else {
- CombinedStoreDsCntInstr->eraseFromParent();
- Modified = true;
- }
- }
-
- // Look for an opportunity to convert existing S_WAIT_LOADCNT,
- // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
- // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
- // instructions so that createNewWaitcnt() will create new combined
- // instructions to replace them.
-
- if (Wait.DsCnt != ~0u) {
- // This is a vector of addresses in WaitInstrs pointing to instructions
- // that should be removed if they are present.
- SmallVector<MachineInstr **, 2> WaitsToErase;
-
- // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
- // both) need to be waited for, ensure that there are no existing
- // individual wait count instructions for these.
-
- if (Wait.LoadCnt != ~0u) {
- WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
- WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
- } else if (Wait.StoreCnt != ~0u) {
- WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
- WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
- }
-
- for (MachineInstr **WI : WaitsToErase) {
- if (!*WI)
- continue;
-
- (*WI)->eraseFromParent();
- *WI = nullptr;
- Modified = true;
- }
- }
-
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (!WaitInstrs[CT])
- continue;
-
- unsigned NewCnt = getWait(Wait, CT);
- if (NewCnt != ~0u) {
- Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
- AMDGPU::OpName::simm16, NewCnt);
- Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
-
- ScoreBrackets.applyWaitcnt(CT, NewCnt);
- setNoWait(Wait, CT);
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: " << *WaitInstrs[CT]
- << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitInstrs[CT] << '\n');
- } else {
- WaitInstrs[CT]->eraseFromParent();
- Modified = true;
+ }
+ // VMem load vgpr def
+ else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
+ for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+ // If we find a register that is loaded inside the loop, 1. and 2.
+ // are invalidated and we can exit.
+ if (VgprUse.contains(RegNo))
+ return false;
+ VgprDef.insert(RegNo);
+ }
+ }
}
}
-
- return Modified;
+ if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+ return true;
+ return HasVMemLoad && UsesVgprLoadedOutside;
}
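
For reference, a minimal standalone sketch of the flush decision that the two returns above encode, with plain bools standing in for the loop scan and HasVscnt standing in for ST->hasVscnt() (the helper name is hypothetical):

// Decide whether to flush vmcnt in a loop preheader, given a summary of the
// loop body. Mirrors the two situations handled above.
static bool shouldFlushVmCntSketch(bool HasVMemLoad, bool HasVMemStore,
                                   bool UsesVgprLoadedOutside, bool HasVscnt) {
  // 1. Store-only loops matter only on targets without a separate store
  //    counter, and only if a value loaded outside the loop is used inside.
  if (!HasVscnt && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // 2. The loop loads from VMEM and also uses a value loaded outside it.
  return HasVMemLoad && UsesVgprLoadedOutside;
}
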
-/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
-bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
- MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
- assert(ST);
- assert(!isNormalMode(MaxCounter));
-
- bool Modified = false;
- const DebugLoc &DL = Block.findDebugLoc(It);
-
- // Check for opportunities to use combined wait instructions.
- if (Wait.DsCnt != ~0u) {
- MachineInstr *SWaitInst = nullptr;
-
- if (Wait.LoadCnt != ~0u) {
- unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
-
- SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
- .addImm(Enc);
-
- Wait.LoadCnt = ~0u;
- Wait.DsCnt = ~0u;
- } else if (Wait.StoreCnt != ~0u) {
- unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
-
- SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
- .addImm(Enc);
-
- Wait.StoreCnt = ~0u;
- Wait.DsCnt = ~0u;
- }
-
- if (SWaitInst) {
- Modified = true;
-
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
+// Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
+// FLAT instruction.
+WaitEventType
+SIWaitCntsInserter::getVmemWaitEventType(const MachineInstr &Inst) const {
+ // Maps VMEM access types to their corresponding WaitEventType.
+ static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
+ VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
+
+ assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
+ // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+ // these should use VM_CNT.
+ if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+ return VMEM_ACCESS;
+ if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
+ // FLAT and SCRATCH instructions may access scratch. Other VMEM
+ // instructions do not.
+ if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ return SCRATCH_WRITE_ACCESS;
+ return VMEM_WRITE_ACCESS;
+ }
+ if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+ return VMEM_READ_ACCESS;
+ return VmemReadMapping[getVmemType(Inst)];
+}
+
+void SIWaitCntsInserter::setForceEmitWaitcnt() {
+// For non-debug builds, ForceEmitWaitcnt has been initialized to false.
+// For debug builds, get the debug counter info and adjust if need be.
+#ifndef NDEBUG
+ if (DebugCounter::isCounterSet(ForceExpCounter) &&
+ DebugCounter::shouldExecute(ForceExpCounter)) {
+ ForceEmitWaitcnt[EXP_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[EXP_CNT] = false;
}
- // Generate an instruction for any remaining counter that needs
- // waiting for.
-
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- unsigned Count = getWait(Wait, CT);
- if (Count == ~0u)
- continue;
-
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
- .addImm(Count);
-
- Modified = true;
-
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ ForceEmitWaitcnt[DS_CNT] = true;
+ ForceEmitWaitcnt[KM_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[DS_CNT] = false;
+ ForceEmitWaitcnt[KM_CNT] = false;
}
- return Modified;
-}
-
-static bool readsVCCZ(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
- !MI.getOperand(1).isUndef();
-}
-
-/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
-static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
- // Currently all conventions wait, but this may not always be the case.
- //
- // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
- // senses to omit the wait and do it in the caller.
- return true;
-}
-
-/// \returns true if the callee is expected to wait for any outstanding waits
-/// before returning.
-static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
- return true;
+ if (DebugCounter::isCounterSet(ForceVMCounter) &&
+ DebugCounter::shouldExecute(ForceVMCounter)) {
+ ForceEmitWaitcnt[LOAD_CNT] = true;
+ ForceEmitWaitcnt[SAMPLE_CNT] = true;
+ ForceEmitWaitcnt[BVH_CNT] = true;
+ } else {
+ ForceEmitWaitcnt[LOAD_CNT] = false;
+ ForceEmitWaitcnt[SAMPLE_CNT] = false;
+ ForceEmitWaitcnt[BVH_CNT] = false;
+ }
+#endif // NDEBUG
}
/// Generate s_waitcnt instruction to be placed before cur_Inst.
@@ -1601,10 +292,9 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
/// flush the vmcnt counter here.
-bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt) {
+bool SIWaitCntsInserter::generateWaitcntInstBefore(
+ MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr, bool FlushVmCnt, VGPRInstsSet *VGPRInsts) {
setForceEmitWaitcnt();
if (MI.isMetaInstruction())
@@ -1624,6 +314,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.LoadCnt = 0;
}
+ MachineFunction *MF = MI.getParent()->getParent();
+ bool OptNone = MF->getFunction().hasOptNone() ||
+ MF->getTarget().getOptLevel() == CodeGenOptLevel::None;
+ InstCounterType SmemAccessCounter =
+ eventCounter(WCG->getWaitEventMask(), SMEM_ACCESS);
+
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
@@ -1644,7 +340,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
- ReleaseVGPRInsts.insert(&MI);
+ VGPRInsts->insert(&MI);
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -1734,21 +430,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
if (MI.getOperand(CallAddrOpIdx).isReg()) {
- RegInterval CallAddrOpInterval =
+ auto [CallAddrOpLow, CallAddrOpHigh] =
ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
- for (int RegNo = CallAddrOpInterval.first;
- RegNo < CallAddrOpInterval.second; ++RegNo)
+ for (int RegNo = CallAddrOpLow; RegNo < CallAddrOpHigh; ++RegNo)
ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
if (RtnAddrOpIdx != -1) {
- RegInterval RtnAddrOpInterval =
+ auto [RtnAddrOpLow, RtnAddrOpHigh] =
ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
- for (int RegNo = RtnAddrOpInterval.first;
- RegNo < RtnAddrOpInterval.second; ++RegNo)
+ for (int RegNo = RtnAddrOpLow; RegNo < RtnAddrOpHigh; ++RegNo)
ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
}
}
@@ -1816,10 +510,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
continue;
- RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
+ auto [RegLow, RegHigh] = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
@@ -1867,361 +561,39 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
- Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
-
- if (ForceEmitWaitcnt[LOAD_CNT])
- Wait.LoadCnt = 0;
- if (ForceEmitWaitcnt[EXP_CNT])
- Wait.ExpCnt = 0;
- if (ForceEmitWaitcnt[DS_CNT])
- Wait.DsCnt = 0;
- if (ForceEmitWaitcnt[SAMPLE_CNT])
- Wait.SampleCnt = 0;
- if (ForceEmitWaitcnt[BVH_CNT])
- Wait.BvhCnt = 0;
- if (ForceEmitWaitcnt[KM_CNT])
- Wait.KmCnt = 0;
-
- if (FlushVmCnt) {
- if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
- Wait.LoadCnt = 0;
- if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
- Wait.SampleCnt = 0;
- if (ScoreBrackets.hasPendingEvent(BVH_CNT))
- Wait.BvhCnt = 0;
- }
-
- return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
- OldWaitcntInstr);
-}
-
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
- AMDGPU::Waitcnt Wait;
-
- unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
- unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
- unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
- if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
- return false;
-
- if (LoadCntPending != 0)
- Wait.LoadCnt = 0;
- if (SampleCntPending != 0)
- Wait.SampleCnt = 0;
- if (BvhCntPending != 0)
- Wait.BvhCnt = 0;
-
- return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr);
-}
-
-bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
- MachineBasicBlock::instr_iterator It,
- MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
- bool Modified = false;
-
- if (OldWaitcntInstr)
- // Try to merge the required wait with preexisting waitcnt instructions.
- // Also erase redundant waitcnt.
- Modified =
- WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
-
- // Any counts that could have been applied to any existing waitcnt
- // instructions will have been done so, now deal with any remaining.
- ScoreBrackets.applyWaitcnt(Wait);
-
- // ExpCnt can be merged into VINTERP.
- if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
- SIInstrInfo::isVINTERP(*It)) {
- MachineOperand *WaitExp =
- TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
- if (Wait.ExpCnt < WaitExp->getImm()) {
- WaitExp->setImm(Wait.ExpCnt);
- Modified = true;
- }
- Wait.ExpCnt = ~0u;
-
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
- << "Update Instr: " << *It);
- }
-
- if (WCG->createNewWaitcnt(Block, It, Wait))
- Modified = true;
-
- return Modified;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens other
-// than LDS. Other address spaces supported by flat memory operations involve
-// global memory.
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // All flat instructions use the VMEM counter.
- assert(TII->usesVM_CNT(MI));
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access VMEM.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves VMEM.
- // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
- // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
- // (GDS) address space is not supported by flat operations. Therefore, simply
- // return true unless only the LDS address space is found.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- assert(AS != AMDGPUAS::REGION_ADDRESS);
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- return true;
- }
-
- return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either LDS or FLAT.
-bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
- if (!TII->usesLGKM_CNT(MI))
- return false;
-
- // If in tgsplit mode then there can be no use of LDS.
- if (ST->isTgSplitEnabled())
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access LDS.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves LDS.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
- return true;
- }
-
- return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
- const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // SCRATCH instructions always access scratch.
- if (TII->isFLATScratch(MI))
- return true;
-
- // GLOBAL instructions never access scratch.
- if (TII->isFLATGlobal(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access scratch.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
- unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- });
-}
-
-static bool isCacheInvOrWBInst(MachineInstr &Inst) {
- auto Opc = Inst.getOpcode();
- return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
- Opc == AMDGPU::GLOBAL_WBINV;
-}
-
-void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
- WaitcntBrackets *ScoreBrackets) {
- // Now look at the instruction opcode. If it is a memory access
- // instruction, update the upper-bound of the appropriate counter's
- // bracket and the destination operand scores.
- // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
-
- if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
- if (TII->isAlwaysGDS(Inst.getOpcode()) ||
- TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
- ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
- } else {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
- }
- } else if (TII->isFLAT(Inst)) {
- // TODO: Track this properly.
- if (isCacheInvOrWBInst(Inst))
- return;
-
- assert(Inst.mayLoadOrStore());
-
- int FlatASCount = 0;
-
- if (mayAccessVMEMThroughFlat(Inst)) {
- ++FlatASCount;
- ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
- Inst);
- }
-
- if (mayAccessLDSThroughFlat(Inst)) {
- ++FlatASCount;
- ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
- }
-
- // A Flat memory operation must access at least one address space.
- assert(FlatASCount);
-
- // This is a flat memory operation that access both VMEM and LDS, so note it
- // - it will require that both the VM and LGKM be flushed to zero if it is
- // pending when a VM or LGKM dependency occurs.
- if (FlatASCount > 1)
- ScoreBrackets->setPendingFlat();
- } else if (SIInstrInfo::isVMEM(Inst) &&
- !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
- Inst);
-
- if (ST->vmemWriteNeedsExpWaitcnt() &&
- (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
- }
- } else if (TII->isSMRD(Inst)) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
- } else if (Inst.isCall()) {
- if (callWaitsOnFunctionReturn(Inst)) {
- // Act as a wait on everything
- ScoreBrackets->applyWaitcnt(
- WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
- ScoreBrackets->setStateOnFunctionEntryOrReturn();
- } else {
- // May need to way wait for anything.
- ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
- }
- } else if (SIInstrInfo::isLDSDIR(Inst)) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
- } else if (TII->isVINTERP(Inst)) {
- int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
- ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
- } else if (SIInstrInfo::isEXP(Inst)) {
- unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
- if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
- else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
- else
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
- } else {
- switch (Inst.getOpcode()) {
- case AMDGPU::S_SENDMSG:
- case AMDGPU::S_SENDMSG_RTN_B32:
- case AMDGPU::S_SENDMSG_RTN_B64:
- case AMDGPU::S_SENDMSGHALT:
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
- break;
- case AMDGPU::S_MEMTIME:
- case AMDGPU::S_MEMREALTIME:
- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
- case AMDGPU::S_BARRIER_LEAVE:
- case AMDGPU::S_GET_BARRIER_STATE_M0:
- case AMDGPU::S_GET_BARRIER_STATE_IMM:
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
- break;
- }
- }
-}
-
-bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
- unsigned OtherScore) {
- unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
- unsigned OtherShifted =
- OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
- Score = std::max(MyShifted, OtherShifted);
- return OtherShifted > MyShifted;
-}
+ Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
-/// Merge the pending events and associater score brackets of \p Other into
-/// this brackets status.
-///
-/// Returns whether the merge resulted in a change that requires tighter waits
-/// (i.e. the merged brackets strictly dominate the original brackets).
-bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
- bool StrictDom = false;
-
- VgprUB = std::max(VgprUB, Other.VgprUB);
- SgprUB = std::max(SgprUB, Other.SgprUB);
-
- for (auto T : inst_counter_types(MaxCounter)) {
- // Merge event flags for this counter
- const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
- const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
- if (OtherEvents & ~OldEvents)
- StrictDom = true;
- PendingEvents |= OtherEvents;
-
- // Merge scores for this counter
- const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
- const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
- const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
- if (NewUB < ScoreLBs[T])
- report_fatal_error("waitcnt score overflow");
-
- MergeInfo M;
- M.OldLB = ScoreLBs[T];
- M.OtherLB = Other.ScoreLBs[T];
- M.MyShift = NewUB - ScoreUBs[T];
- M.OtherShift = NewUB - Other.ScoreUBs[T];
-
- ScoreUBs[T] = NewUB;
-
- StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
-
- for (int J = 0; J <= VgprUB; J++)
- StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
-
- if (T == SmemAccessCounter) {
- for (int J = 0; J <= SgprUB; J++)
- StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
- }
- }
+ if (ForceEmitWaitcnt[LOAD_CNT])
+ Wait.LoadCnt = 0;
+ if (ForceEmitWaitcnt[EXP_CNT])
+ Wait.ExpCnt = 0;
+ if (ForceEmitWaitcnt[DS_CNT])
+ Wait.DsCnt = 0;
+ if (ForceEmitWaitcnt[SAMPLE_CNT])
+ Wait.SampleCnt = 0;
+ if (ForceEmitWaitcnt[BVH_CNT])
+ Wait.BvhCnt = 0;
+ if (ForceEmitWaitcnt[KM_CNT])
+ Wait.KmCnt = 0;
- for (int J = 0; J <= VgprUB; J++) {
- unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
- StrictDom |= NewVmemTypes != VgprVmemTypes[J];
- VgprVmemTypes[J] = NewVmemTypes;
+ if (FlushVmCnt) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
}
- return StrictDom;
-}
-
-static bool isWaitInstr(MachineInstr &Inst) {
- unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
- return Opcode == AMDGPU::S_WAITCNT ||
- (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
- Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
- Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
- Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
- counterTypeForInstr(Opcode).has_value();
+ return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
+ OldWaitcntInstr);
}
// Generate s_waitcnt instructions where needed.
-bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
- MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets) {
+bool SIWaitCntsInserter::insertWaitcntInBlock(MachineFunction &MF,
+ MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ VGPRInstsSet *VGPRInsts) {
bool Modified = false;
LLVM_DEBUG({
@@ -2265,7 +637,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
- FlushVmCnt);
+ FlushVmCnt, VGPRInsts);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
@@ -2355,324 +727,165 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
return Modified;
}
-// Return true if the given machine basic block is a preheader of a loop in
-// which we want to flush the vmcnt counter, and false otherwise.
-bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
- WaitcntBrackets &ScoreBrackets) {
- auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
- if (!IsInserted)
- return Iterator->second;
-
- MachineBasicBlock *Succ = MBB.getSingleSuccessor();
- if (!Succ)
- return false;
-
- MachineLoop *Loop = MLI->getLoopFor(Succ);
- if (!Loop)
- return false;
+void SIWaitCntsInserter::updateEventWaitcntAfter(
+ MachineInstr &Inst, WaitcntBrackets *ScoreBrackets) {
+ // Now look at the instruction opcode. If it is a memory access
+ // instruction, update the upper-bound of the appropriate counter's
+ // bracket and the destination operand scores.
+ // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
- if (Loop->getLoopPreheader() == &MBB &&
- shouldFlushVmCnt(Loop, ScoreBrackets)) {
- Iterator->second = true;
- return true;
- }
+ if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
+ if (TII->isAlwaysGDS(Inst.getOpcode()) ||
+ TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+ } else {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+ }
+ } else if (TII->isFLAT(Inst)) {
+ // TODO: Track this properly.
+ if (isCacheInvOrWBInst(Inst))
+ return;
- return false;
-}
+ assert(Inst.mayLoadOrStore());
-bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
- return SIInstrInfo::isVMEM(MI) ||
- (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
-}
+ int FlatASCount = 0;
-// Return true if it is better to flush the vmcnt counter in the preheader of
-// the given loop. We currently decide to flush in two situations:
-// 1. The loop contains vmem store(s), no vmem load and at least one use of a
-// vgpr containing a value that is loaded outside of the loop. (Only on
-// targets with no vscnt counter).
-// 2. The loop contains vmem load(s), but the loaded values are not used in the
-// loop, and at least one use of a vgpr containing a value that is loaded
-// outside of the loop.
-bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
- WaitcntBrackets &Brackets) {
- bool HasVMemLoad = false;
- bool HasVMemStore = false;
- bool UsesVgprLoadedOutside = false;
- DenseSet<Register> VgprUse;
- DenseSet<Register> VgprDef;
+ if (mayAccessVMEMThroughFlat(Inst)) {
+ ++FlatASCount;
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
+ Inst);
+ }
- for (MachineBasicBlock *MBB : ML->blocks()) {
- for (MachineInstr &MI : *MBB) {
- if (isVMEMOrFlatVMEM(MI)) {
- if (MI.mayLoad())
- HasVMemLoad = true;
- if (MI.mayStore())
- HasVMemStore = true;
- }
- for (unsigned I = 0; I < MI.getNumOperands(); I++) {
- MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
- continue;
- RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
- // Vgpr use
- if (Op.isUse()) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- // If we find a register that is loaded inside the loop, 1. and 2.
- // are invalidated and we can exit.
- if (VgprDef.contains(RegNo))
- return false;
- VgprUse.insert(RegNo);
- // If at least one of Op's registers is in the score brackets, the
- // value is likely loaded outside of the loop.
- if (Brackets.getRegScore(RegNo, LOAD_CNT) >
- Brackets.getScoreLB(LOAD_CNT) ||
- Brackets.getRegScore(RegNo, SAMPLE_CNT) >
- Brackets.getScoreLB(SAMPLE_CNT) ||
- Brackets.getRegScore(RegNo, BVH_CNT) >
- Brackets.getScoreLB(BVH_CNT)) {
- UsesVgprLoadedOutside = true;
- break;
- }
- }
- }
- // VMem load vgpr def
- else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- // If we find a register that is loaded inside the loop, 1. and 2.
- // are invalidated and we can exit.
- if (VgprUse.contains(RegNo))
- return false;
- VgprDef.insert(RegNo);
- }
- }
+ if (mayAccessLDSThroughFlat(Inst)) {
+ ++FlatASCount;
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
- }
- if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
- return true;
- return HasVMemLoad && UsesVgprLoadedOutside;
-}
-bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
- ST = &MF.getSubtarget<GCNSubtarget>();
- TII = ST->getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- MLI = &getAnalysis<MachineLoopInfo>();
- PDT = &getAnalysis<MachinePostDominatorTree>();
- if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
- AA = &AAR->getAAResults();
+ // A Flat memory operation must access at least one address space.
+ assert(FlatASCount);
- AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ // This is a flat memory operation that accesses both VMEM and LDS, so note it
+ // - it will require that both the VM and LGKM be flushed to zero if it is
+ // pending when a VM or LGKM dependency occurs.
+ if (FlatASCount > 1)
+ ScoreBrackets->setPendingFlat();
+ } else if (SIInstrInfo::isVMEM(Inst) &&
+ !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
+ Inst);
- if (ST->hasExtendedWaitCounts()) {
- MaxCounter = NUM_EXTENDED_INST_CNTS;
- WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
- WCG = &WCGGFX12Plus;
+ if (ST->vmemWriteNeedsExpWaitcnt() &&
+ (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+ }
+ } else if (TII->isSMRD(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ } else if (Inst.isCall()) {
+ if (callWaitsOnFunctionReturn(Inst)) {
+ // Act as a wait on everything
+ ScoreBrackets->applyWaitcnt(
+ WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
+ ScoreBrackets->setStateOnFunctionEntryOrReturn();
+ } else {
+ // May need to wait for anything.
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+ }
+ } else if (SIInstrInfo::isLDSDIR(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
+ } else if (TII->isVINTERP(Inst)) {
+ int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
+ ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
+ } else if (SIInstrInfo::isEXP(Inst)) {
+ unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+ if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+ else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
} else {
- MaxCounter = NUM_NORMAL_INST_CNTS;
- WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
- WCG = &WCGPreGFX12;
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSG_RTN_B32:
+ case AMDGPU::S_SENDMSG_RTN_B64:
+ case AMDGPU::S_SENDMSGHALT:
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+ break;
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+ case AMDGPU::S_BARRIER_LEAVE:
+ case AMDGPU::S_GET_BARRIER_STATE_M0:
+ case AMDGPU::S_GET_BARRIER_STATE_IMM:
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ break;
+ }
}
+}
- ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
- for (auto T : inst_counter_types())
- ForceEmitWaitcnt[T] = false;
-
- const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
-
- SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
-
- OptNone = MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
+class SIInsertWaitcnts : public MachineFunctionPass {
+public:
+ static char ID;
- HardwareLimits Limits = {};
- if (ST->hasExtendedWaitCounts()) {
- Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
- Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
- } else {
- Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
- Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
+ SIInsertWaitcnts() : MachineFunctionPass(ID) {
+ (void)ForceExpCounter;
+ (void)ForceLgkmCounter;
+ (void)ForceVMCounter;
}
- Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
- Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
- Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
- Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
- Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
-
- unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
- unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
- assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
-
- RegisterEncoding Encoding = {};
- Encoding.VGPR0 =
- TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
- Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
- Encoding.SGPR0 =
- TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
- Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
-
- BlockInfos.clear();
- bool Modified = false;
-
- MachineBasicBlock &EntryBB = MF.front();
- MachineBasicBlock::iterator I = EntryBB.begin();
-
- if (!MFI->isEntryFunction()) {
- // Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to do the wait after the
- // costly call sequence.
-
- // TODO: Could insert earlier and schedule more liberally with operations
- // that only use caller preserved registers.
- for (MachineBasicBlock::iterator E = EntryBB.end();
- I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
- ;
-
- if (ST->hasExtendedWaitCounts()) {
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
- .addImm(0);
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
- continue;
- BuildMI(EntryBB, I, DebugLoc(),
- TII->get(instrsForExtendedCounterTypes[CT]))
- .addImm(0);
- }
- } else {
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
- }
+ bool runOnMachineFunction(MachineFunction &MF) override;
- auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
- SmemAccessCounter);
- NonKernelInitialState->setStateOnFunctionEntryOrReturn();
- BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+ StringRef getPassName() const override {
+ return "SI Insert Wait Instructions";
+ }
- Modified = true;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
+};
- // Keep iterating over the blocks in reverse post order, inserting and
- // updating s_waitcnt where needed, until a fix point is reached.
- for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
- BlockInfos.insert({MBB, BlockInfo()});
-
- std::unique_ptr<WaitcntBrackets> Brackets;
- bool Repeat;
- do {
- Repeat = false;
-
- for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
- ++BII) {
- MachineBasicBlock *MBB = BII->first;
- BlockInfo &BI = BII->second;
- if (!BI.Dirty)
- continue;
-
- if (BI.Incoming) {
- if (!Brackets)
- Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
- else
- *Brackets = *BI.Incoming;
- } else {
- if (!Brackets)
- Brackets = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
- SmemAccessCounter);
- else
- *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
- WaitEventMaskForInst, SmemAccessCounter);
- }
+INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE,
+ "SI Insert Wait Instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Wait Instructions",
+ false, false)
- Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
- BI.Dirty = false;
-
- if (Brackets->hasPendingEvent()) {
- BlockInfo *MoveBracketsToSucc = nullptr;
- for (MachineBasicBlock *Succ : MBB->successors()) {
- auto SuccBII = BlockInfos.find(Succ);
- BlockInfo &SuccBI = SuccBII->second;
- if (!SuccBI.Incoming) {
- SuccBI.Dirty = true;
- if (SuccBII <= BII)
- Repeat = true;
- if (!MoveBracketsToSucc) {
- MoveBracketsToSucc = &SuccBI;
- } else {
- SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
- }
- } else if (SuccBI.Incoming->merge(*Brackets)) {
- SuccBI.Dirty = true;
- if (SuccBII <= BII)
- Repeat = true;
- }
- }
- if (MoveBracketsToSucc)
- MoveBracketsToSucc->Incoming = std::move(Brackets);
- }
- }
- } while (Repeat);
+char SIInsertWaitcnts::ID = 0;
- if (ST->hasScalarStores()) {
- SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
- bool HaveScalarStores = false;
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (!HaveScalarStores && TII->isScalarStore(MI))
- HaveScalarStores = true;
+FunctionPass *llvm::createSIInsertWaitcntsPass() {
+ return new SIInsertWaitcnts();
+}
- if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
- EndPgmBlocks.push_back(&MBB);
- }
- }
+bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
+ MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+ AliasAnalysis *AA = nullptr;
+ if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAR->getAAResults();
- if (HaveScalarStores) {
- // If scalar writes are used, the cache must be flushed or else the next
- // wave to reuse the same scratch memory can be clobbered.
- //
- // Insert s_dcache_wb at wave termination points if there were any scalar
- // stores, and only if the cache hasn't already been flushed. This could
- // be improved by looking across blocks for flushes in postdominating
- // blocks from the stores but an explicitly requested flush is probably
- // very rare.
- for (MachineBasicBlock *MBB : EndPgmBlocks) {
- bool SeenDCacheWB = false;
-
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
- SeenDCacheWB = true;
- else if (TII->isScalarStore(*I))
- SeenDCacheWB = false;
-
- // FIXME: It would be better to insert this before a waitcnt if any.
- if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
- !SeenDCacheWB) {
- Modified = true;
- BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
- }
- }
- }
- }
- }
+ WaitCntGeneratorPreGFX12 WCGPreGFX12;
+ WaitCntGeneratorGFX12Plus WCGGFX12Plus;
+ InstCounterType MaxCounter;
+ WaitCntGenerator *WCG =
+ getWaitCntGenerator(MF, WCGPreGFX12, WCGGFX12Plus, MaxCounter);
- // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
- // instructions.
- for (MachineInstr *MI : ReleaseVGPRInsts) {
- if (ST->requiresNopBeforeDeallocVGPRs()) {
- BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
- .addImm(0);
- }
- BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
- .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
- Modified = true;
- }
- ReleaseVGPRInsts.clear();
+ SIWaitCntsInserter WCountsInserter = SIWaitCntsInserter(
+ ST, &MF.getRegInfo(), WCG, MaxCounter, ForceEmitZeroFlag, MLI, PDT, AA);
- return Modified;
+ // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
+ // message.
+ DenseSet<MachineInstr *> ReleaseVGPRInsts;
+
+ return WCountsInserter.insertWaitCntsInFunction(MF, &ReleaseVGPRInsts);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6826cd27319507..a347cdd2241d66 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -14,6 +14,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/TargetParser/TargetParser.h"
#include <array>
#include <functional>
#include <utility>
@@ -40,8 +41,6 @@ struct kernel_descriptor_t;
namespace AMDGPU {
-struct IsaVersion;
-
/// Generic target versions emitted by this version of LLVM.
///
/// These numbers are incremented every time a codegen breaking change occurs
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp
new file mode 100644
index 00000000000000..e332a648e59418
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.cpp
@@ -0,0 +1,1393 @@
+//===-- AMDGPUWaitCountUtils.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Common interface to insert various wait counts for memory operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUWaitCountUtils.h"
+#include "AMDGPU.h"
+#include "AMDGPUBaseInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+#define DEBUG_TYPE "amdgpu-waitcount-utils"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+namespace llvm {
+
+namespace AMDGPU {
+
+static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
+ unsigned NewEnc) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+ assert(OpIdx >= 0);
+
+ MachineOperand &MO = MI.getOperand(OpIdx);
+
+ if (NewEnc == MO.getImm())
+ return false;
+
+ MO.setImm(NewEnc);
+ return true;
+}
+
+/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
+/// and if so, which counter it is waiting on.
+static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_WAIT_LOADCNT:
+ return LOAD_CNT;
+ case AMDGPU::S_WAIT_EXPCNT:
+ return EXP_CNT;
+ case AMDGPU::S_WAIT_STORECNT:
+ return STORE_CNT;
+ case AMDGPU::S_WAIT_SAMPLECNT:
+ return SAMPLE_CNT;
+ case AMDGPU::S_WAIT_BVHCNT:
+ return BVH_CNT;
+ case AMDGPU::S_WAIT_DSCNT:
+ return DS_CNT;
+ case AMDGPU::S_WAIT_KMCNT:
+ return KM_CNT;
+ default:
+ return {};
+ }
+}
+
+bool updateVMCntOnly(const MachineInstr &Inst) {
+ return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
+ SIInstrInfo::isFLATScratch(Inst);
+}
+
+bool isWaitInstr(MachineInstr &Inst) {
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
+ return Opcode == AMDGPU::S_WAITCNT ||
+ (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
+ Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
+ Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ counterTypeForInstr(Opcode).has_value();
+}
+
+VmemType getVmemType(const MachineInstr &Inst) {
+ assert(updateVMCntOnly(Inst));
+ if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
+ !SIInstrInfo::isVSAMPLE(Inst))
+ return VMEM_NOSAMPLER;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ return BaseInfo->BVH ? VMEM_BVH
+ : BaseInfo->Sampler ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
+}
+
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+ // Currently all conventions wait, but this may not always be the case.
+ //
+ // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+ // sense to omit the wait and do it in the caller.
+ return true;
+}
+
+/// \returns true if the callee is expected to wait for any outstanding waits
+/// before returning.
+bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
+
+// Mapping from event to counter according to the table masks.
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+ for (auto T : inst_counter_types()) {
+ if (masks[T] & (1 << E))
+ return T;
+ }
+ llvm_unreachable("event type has no associated counter");
+}
+
+bool readsVCCZ(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+ !MI.getOperand(1).isUndef();
+}
+
+bool isCacheInvOrWBInst(MachineInstr &Inst) {
+ auto Opc = Inst.getOpcode();
+ return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+ Opc == AMDGPU::GLOBAL_WBINV;
+}
+
+#ifndef NDEBUG
+static bool isNormalMode(InstCounterType MaxCounter) {
+ return MaxCounter == NUM_NORMAL_INST_CNTS;
+}
+#endif // NDEBUG
+
+unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ switch (T) {
+ case LOAD_CNT:
+ return Wait.LoadCnt;
+ case EXP_CNT:
+ return Wait.ExpCnt;
+ case DS_CNT:
+ return Wait.DsCnt;
+ case STORE_CNT:
+ return Wait.StoreCnt;
+ case SAMPLE_CNT:
+ return Wait.SampleCnt;
+ case BVH_CNT:
+ return Wait.BvhCnt;
+ case KM_CNT:
+ return Wait.KmCnt;
+ default:
+ llvm_unreachable("bad InstCounterType");
+ }
+}
+
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+ unsigned &WC = getCounterRef(Wait, T);
+ WC = std::min(WC, Count);
+}
+
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ getCounterRef(Wait, T) = ~0u;
+}
+
+unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ return getCounterRef(Wait, T);
+}
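
These accessors encode the convention that ~0u means "no wait requested" and that combining requests takes the minimum, since a smaller count is the stricter wait. A self-contained illustration (the helper name is hypothetical):

#include <algorithm>
#include <cassert>

// Merge wait requests on a single counter: ~0u is "no wait", and a smaller
// value is stricter, so the minimum wins.
static void addWaitSketch(unsigned &Counter, unsigned Count) {
  Counter = std::min(Counter, Count);
}

int main() {
  unsigned LoadCnt = ~0u;    // no wait requested yet
  addWaitSketch(LoadCnt, 3); // wait until at most 3 loads are outstanding
  addWaitSketch(LoadCnt, 1); // the stricter request wins
  assert(LoadCnt == 1);
  addWaitSketch(LoadCnt, 5); // a looser request changes nothing
  assert(LoadCnt == 1);
  return 0;
}
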
+
+WaitCntGenerator *getWaitCntGenerator(MachineFunction &MF,
+ WaitCntGeneratorPreGFX12 &WCGPreGFX12,
+ WaitCntGeneratorGFX12Plus &WCGGFX12Plus,
+ InstCounterType &MaxCounter) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+ WaitCntGenerator *WCG = nullptr;
+
+ if (ST->hasExtendedWaitCounts()) {
+ MaxCounter = NUM_EXTENDED_INST_CNTS;
+ WCGGFX12Plus = WaitCntGeneratorGFX12Plus(ST, MaxCounter);
+ WCG = &WCGGFX12Plus;
+ } else {
+ MaxCounter = NUM_NORMAL_INST_CNTS;
+ WCGPreGFX12 = WaitCntGeneratorPreGFX12(ST);
+ WCG = &WCGPreGFX12;
+ }
+
+ return WCG;
+}
+
+//===----------------------------------------------------------------------===//
+// WaitcntBrackets member functions.
+//===----------------------------------------------------------------------===//
+
+RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ unsigned OpNo) const {
+ const MachineOperand &Op = MI->getOperand(OpNo);
+ if (!TRI->isInAllocatableClass(Op.getReg()))
+ return {-1, -1};
+
+ // A use via a PW operand does not need a waitcnt.
+ // A partial write is not a WAW.
+ assert(!Op.getSubReg() || !Op.isUndef());
+
+ RegInterval Result;
+
+ unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
+ AMDGPU::HWEncoding::REG_IDX_MASK;
+
+ if (TRI->isVectorRegister(*MRI, Op.getReg())) {
+ assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
+ Result.first = Reg - Encoding.VGPR0;
+ if (TRI->isAGPR(*MRI, Op.getReg()))
+ Result.first += AGPR_OFFSET;
+ assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+ } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+ assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
+ Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
+ assert(Result.first >= NUM_ALL_VGPRS &&
+ Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+ }
+ // TODO: Handle TTMP
+ // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
+ else
+ return {-1, -1};
+
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ Result.second = Result.first + ((Size + 16) / 32);
+
+ return Result;
+}
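
The interval width above comes from ((Size + 16) / 32), which maps a register's size in bits to the number of 32-bit slots it occupies, rounding a 16-bit register up to one slot. A standalone check of that arithmetic:

#include <cassert>

// One score slot is tracked per 32-bit register; a 16-bit register still
// occupies a full slot.
static int intervalWidth(unsigned SizeInBits) {
  return (SizeInBits + 16) / 32;
}

int main() {
  assert(intervalWidth(16) == 1);  // 16-bit VGPR half
  assert(intervalWidth(32) == 1);  // v0
  assert(intervalWidth(64) == 2);  // v[0:1]
  assert(intervalWidth(128) == 4); // v[0:3]
  return 0;
}
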
+
+void WaitcntBrackets::setExpScore(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, unsigned OpNo,
+ unsigned Val) {
+ auto [RegLow, RegHigh] = getRegInterval(MI, MRI, TRI, OpNo);
+ assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
+ for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+ setRegScore(RegNo, EXP_CNT, Val);
+ }
+}
+
+void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ WaitEventType E, MachineInstr &Inst) {
+ InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+
+ unsigned UB = getScoreUB(T);
+ unsigned CurrScore = UB + 1;
+ if (CurrScore == 0)
+ report_fatal_error("InsertWaitcnt score wraparound");
+ // PendingEvents and ScoreUB need to be updated regardless of whether this
+ // event changes the score of a register or not.
+ // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
+ PendingEvents |= 1 << E;
+ setScoreUB(T, CurrScore);
+
+ if (T == EXP_CNT) {
+ // Put score on the source vgprs. If this is a store, just use those
+ // specific register(s).
+ if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+ int AddrOpIdx =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
+ // All GDS operations must protect their address register (same as
+ // export.)
+ if (AddrOpIdx != -1) {
+ setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
+ }
+
+ if (Inst.mayStore()) {
+ if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
+ setExpScore(&Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data0),
+ CurrScore);
+ }
+ if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
+ setExpScore(&Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data1),
+ CurrScore);
+ }
+ } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
+ Inst.getOpcode() != AMDGPU::DS_APPEND &&
+ Inst.getOpcode() != AMDGPU::DS_CONSUME &&
+ Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = Inst.getOperand(I);
+ if (Op.isReg() && !Op.isDef() &&
+ TRI->isVectorRegister(*MRI, Op.getReg())) {
+ setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+ }
+ }
+ }
+ } else if (TII->isFLAT(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isMIMG(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isMTBUF(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ }
+ } else if (TII->isMUBUF(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isLDSDIR(Inst)) {
+ // LDSDIR instructions attach the score to the destination.
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
+ CurrScore);
+ } else {
+ if (TII->isEXP(Inst)) {
+ // For export the destination registers are really temps that
+ // can be used as the actual source after export patching, so
+ // we need to treat them like sources and set the EXP_CNT
+ // score.
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ MachineOperand &DefMO = Inst.getOperand(I);
+ if (DefMO.isReg() && DefMO.isDef() &&
+ TRI->isVGPR(*MRI, DefMO.getReg())) {
+ setRegScore(
+ TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
+ EXP_CNT, CurrScore);
+ }
+ }
+ }
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = Inst.getOperand(I);
+ if (MO.isReg() && !MO.isDef() &&
+ TRI->isVectorRegister(*MRI, MO.getReg())) {
+ setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+ }
+ }
+ }
+#if 0 // TODO: check if this is handled by MUBUF code above.
+ } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+ MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
+ unsigned OpNo;//TODO: find the OpNo for this operand;
+ auto [RegLow, RegHigh] = getRegInterval(&Inst, MRI, TRI, OpNo);
+ for (int RegNo = RegLow; RegNo < RegHigh;
+ ++RegNo) {
+ setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
+ }
+#endif
+ } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
+ // Match the score to the destination registers.
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ auto &Op = Inst.getOperand(I);
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ auto [RegLow, RegHigh] = getRegInterval(&Inst, MRI, TRI, I);
+ if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
+ if (RegLow >= NUM_ALL_VGPRS)
+ continue;
+ if (updateVMCntOnly(Inst)) {
+ // updateVMCntOnly should only leave us with VGPRs.
+ // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
+ // defs. That's required for a sane index into `VgprVmemTypes` below.
+ assert(TRI->isVectorRegister(*MRI, Op.getReg()));
+ VmemType V = getVmemType(Inst);
+ for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo)
+ VgprVmemTypes[RegNo] |= 1 << V;
+ }
+ }
+ for (int RegNo = RegLow; RegNo < RegHigh; ++RegNo) {
+ setRegScore(RegNo, T, CurrScore);
+ }
+ }
+ if (Inst.mayStore() &&
+ (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+ // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
+ // they have written can be accessed. A load from LDS to VMEM does not need
+ // a wait.
+ unsigned Slot = 0;
+ for (const auto *MemOp : Inst.memoperands()) {
+ if (!MemOp->isStore() ||
+ MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ // Comparing just AA info does not guarantee memoperands are equal
+ // in general, but this is so for LDS DMA in practice.
+ auto AAI = MemOp->getAAInfo();
+ // Alias scope information gives a way to definitely identify an
+ // original memory object, and in practice it is produced by the module
+ // LDS lowering pass. If there is no scope available we will not be able
+ // to disambiguate LDS aliasing, since after module lowering all LDS is
+ // squashed into a single big object. Do not attempt to use one of the
+ // limited LDSDMAStores slots for something we will not be able to use
+ // anyway.
+ if (!AAI || !AAI.Scope)
+ break;
+ for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
+ for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
+ if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
+ Slot = I + 1;
+ break;
+ }
+ }
+ }
+ if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+ break;
+ LDSDMAStores.push_back(&Inst);
+ Slot = LDSDMAStores.size();
+ break;
+ }
+ setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
+ if (Slot)
+ setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+ }
+ }
+}
+
+void WaitcntBrackets::print(raw_ostream &OS) {
+ OS << '\n';
+ for (auto T : inst_counter_types(MaxCounter)) {
+ unsigned SR = getScoreRange(T);
+
+ switch (T) {
+ case LOAD_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
+ << SR << "): ";
+ break;
+ case DS_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
+ << SR << "): ";
+ break;
+ case EXP_CNT:
+ OS << " EXP_CNT(" << SR << "): ";
+ break;
+ case STORE_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
+ << SR << "): ";
+ break;
+ case SAMPLE_CNT:
+ OS << " SAMPLE_CNT(" << SR << "): ";
+ break;
+ case BVH_CNT:
+ OS << " BVH_CNT(" << SR << "): ";
+ break;
+ case KM_CNT:
+ OS << " KM_CNT(" << SR << "): ";
+ break;
+ default:
+ OS << " UNKNOWN(" << SR << "): ";
+ break;
+ }
+
+ if (SR != 0) {
+ // Print vgpr scores.
+ unsigned LB = getScoreLB(T);
+
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned RegScore = getRegScore(J, T);
+ if (RegScore <= LB)
+ continue;
+ unsigned RelScore = RegScore - LB - 1;
+ if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
+ OS << RelScore << ":v" << J << " ";
+ } else {
+ OS << RelScore << ":ds ";
+ }
+ }
+ // Also need to print sgpr scores for lgkm_cnt.
+ if (T == SmemAccessCounter) {
+ for (int J = 0; J <= SgprUB; J++) {
+ unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
+ if (RegScore <= LB)
+ continue;
+ unsigned RelScore = RegScore - LB - 1;
+ OS << RelScore << ":s" << J << " ";
+ }
+ }
+ }
+ OS << '\n';
+ }
+ OS << '\n';
+}
+
+/// Simplify the waitcnt by dropping counts that are already redundant, i.e.
+/// counts for which no wait instruction is needed at all.
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+ simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+ simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ simplifyWaitcnt(DS_CNT, Wait.DsCnt);
+ simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+ simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
+ simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+}
+
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+ unsigned &Count) const {
+ // The number of outstanding events for this type, T, can be calculated
+ // as (UB - LB). If the current Count is greater than or equal to the number
+ // of outstanding events, then the wait for this counter is redundant.
+ if (Count >= getScoreRange(T))
+ Count = ~0u;
+}
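
The check above drops a requested count when it is at least the number of outstanding events (UB - LB), because such a wait cannot retire anything. A standalone illustration with hypothetical names:

#include <cassert>

// A wait that allows at least as many outstanding operations as currently
// exist is a no-op; ~0u marks "no wait needed".
static void simplifySketch(unsigned &Count, unsigned Outstanding) {
  if (Count >= Outstanding)
    Count = ~0u;
}

int main() {
  unsigned C = 5;
  simplifySketch(C, 3); // only 3 operations in flight; waiting for <= 5 is vacuous
  assert(C == ~0u);
  C = 2;
  simplifySketch(C, 3); // waiting until <= 2 remain still does useful work
  assert(C == 2);
  return 0;
}
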
+
+void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
+ AMDGPU::Waitcnt &Wait) const {
+ unsigned ScoreToWait = getRegScore(RegNo, T);
+
+ // If the score of src_operand falls within the bracket, we need an
+ // s_waitcnt instruction.
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
+ if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+ if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
+ !ST->hasFlatLgkmVMemCountInOrder()) {
+ // If there is a pending FLAT operation, and this is a VMem or LGKM
+ // waitcnt and the target can report early completion, then we need
+ // to force a waitcnt 0.
+ addWait(Wait, T, 0);
+ } else if (counterOutOfOrder(T)) {
+ // The counter can get decremented out-of-order when there are multiple
+ // event types in the bracket, so emit a conservative wait of 0 for the
+ // counter.
+ addWait(Wait, T, 0);
+ } else {
+ // If a counter has been maxed out avoid overflow by waiting for
+ // MAX(CounterType) - 1 instead.
+ unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ addWait(Wait, T, NeededWait);
+ }
+ }
+}
+
+void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
+ applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+ applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ applyWaitcnt(DS_CNT, Wait.DsCnt);
+ applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+ applyWaitcnt(BVH_CNT, Wait.BvhCnt);
+ applyWaitcnt(KM_CNT, Wait.KmCnt);
+}
+
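+// Apply a wait of Count on counter T to the bracket. A zero count collapses
+// the bracket and clears all pending events of this type; a nonzero count
+// raises the lower bound so that at most Count events remain outstanding,
+// unless the counter can complete out of order, in which case the bound
+// cannot safely be tightened.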
+void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
+ const unsigned UB = getScoreUB(T);
+ if (Count >= UB)
+ return;
+ if (Count != 0) {
+ if (counterOutOfOrder(T))
+ return;
+ setScoreLB(T, std::max(getScoreLB(T), UB - Count));
+ } else {
+ setScoreLB(T, UB);
+ PendingEvents &= ~WaitEventMaskForInst[T];
+ }
+}
+
+// Where there are multiple types of event in the bracket of a counter,
+// the decrement may go out of order.
+bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
+  // Scalar memory reads can always go out of order.
+ if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+ return true;
+ return hasMixedPendingEvents(T);
+}
+
+WaitCntBitMaskFn WaitcntBrackets::getWaitCntBitMaskFn(InstCounterType T) {
+ switch (T) {
+ case LOAD_CNT:
+ if (ST->hasExtendedWaitCounts())
+ return getLoadcntBitMask;
+
+ return getVmcntBitMask;
+ case DS_CNT:
+ if (ST->hasExtendedWaitCounts())
+ return getDscntBitMask;
+
+ return getLgkmcntBitMask;
+ case EXP_CNT:
+ return getExpcntBitMask;
+ case STORE_CNT:
+ return getStorecntBitMask;
+ case SAMPLE_CNT:
+ return getSamplecntBitMask;
+ case BVH_CNT:
+ return getBvhcntBitMask;
+ case KM_CNT:
+ return getKmcntBitMask;
+ default:
+ llvm_unreachable("bad InstCounterType in getWaitCntBitMaskFn");
+ }
+}
+
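+// Rebase one score onto the merged bracket. A score at or below its old lower
+// bound has no pending event and maps to zero; otherwise it is shifted by the
+// amount its bracket's upper bound moved. Returns true when the other side's
+// rebased score is strictly larger, i.e. the merge tightened this entry.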
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore) {
+ unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+ unsigned OtherShifted =
+ OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
+ Score = std::max(MyShifted, OtherShifted);
+ return OtherShifted > MyShifted;
+}
+
+/// Merge the pending events and associated score brackets of \p Other into
+/// this bracket's status.
+///
+/// Returns whether the merge resulted in a change that requires tighter waits
+/// (i.e. the merged brackets strictly dominate the original brackets).
+bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
+ bool StrictDom = false;
+
+ VgprUB = std::max(VgprUB, Other.VgprUB);
+ SgprUB = std::max(SgprUB, Other.SgprUB);
+
+ for (auto T : inst_counter_types(MaxCounter)) {
+ // Merge event flags for this counter
+ const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+ if (OtherEvents & ~OldEvents)
+ StrictDom = true;
+ PendingEvents |= OtherEvents;
+
+ // Merge scores for this counter
+ const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
+ const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
+ if (NewUB < ScoreLBs[T])
+ report_fatal_error("waitcnt score overflow");
+
+ MergeInfo M;
+ M.OldLB = ScoreLBs[T];
+ M.OtherLB = Other.ScoreLBs[T];
+ M.MyShift = NewUB - ScoreUBs[T];
+ M.OtherShift = NewUB - Other.ScoreUBs[T];
+
+ ScoreUBs[T] = NewUB;
+
+ StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
+
+ for (int J = 0; J <= VgprUB; J++)
+ StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
+
+ if (isSmemCounter(T)) {
+ for (int J = 0; J <= SgprUB; J++)
+ StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+ }
+ }
+
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+ StrictDom |= NewVmemTypes != VgprVmemTypes[J];
+ VgprVmemTypes[J] = NewVmemTypes;
+ }
+
+ return StrictDom;
+}
+
+//===----------------------------------------------------------------------===//
+// WaitCntGeneratorPreGFX12 member functions.
+//===----------------------------------------------------------------------===//
+
+AMDGPU::Waitcnt
+WaitCntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
+}
+
+/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
+/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
+/// from \p Wait that were added by previous passes. Currently this pass
+/// conservatively assumes that these preexisting waits are required for
+/// correctness.
+bool WaitCntGeneratorPreGFX12::applyPreexistingWaitcnt(
+ WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+ assert(ST);
+ assert(isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ MachineInstr *WaitcntInstr = nullptr;
+ MachineInstr *WaitcntVsCntInstr = nullptr;
+
+ for (auto &II :
+ make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+ if (II.isMetaInstruction())
+ continue;
+
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+ bool IsSoft = Opcode != II.getOpcode();
+
+ // Update required wait count. If this is a soft waitcnt (= it was added
+ // by an earlier pass), it may be entirely removed.
+ if (Opcode == AMDGPU::S_WAITCNT) {
+ unsigned IEnc = II.getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
+
+ // Merge consecutive waitcnt of the same type by erasing multiples.
+ if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
+ II.eraseFromParent();
+ Modified = true;
+ } else
+ WaitcntInstr = &II;
+ } else {
+ assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+
+ unsigned OldVSCnt =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
+ Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
+
+ if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
+ II.eraseFromParent();
+ Modified = true;
+ } else
+ WaitcntVsCntInstr = &II;
+ }
+ }
+
+ if (WaitcntInstr) {
+ Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
+ AMDGPU::encodeWaitcnt(IV, Wait));
+ Modified |= promoteSoftWaitCnt(WaitcntInstr);
+
+ ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+ ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ Wait.LoadCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+ Wait.DsCnt = ~0u;
+
+ LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
+ ? dbgs()
+ << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
+ }
+
+ if (WaitcntVsCntInstr) {
+ Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
+ AMDGPU::OpName::simm16, Wait.StoreCnt);
+ Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
+
+ ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ Wait.StoreCnt = ~0u;
+
+ LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntVsCntInstr
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
+ }
+
+ return Modified;
+}
+
+/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
+/// required counters in \p Wait
+bool WaitCntGeneratorPreGFX12::createNewWaitcnt(
+ MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) {
+ assert(ST);
+ assert(isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ const DebugLoc &DL = Block.findDebugLoc(It);
+
+  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
+ // single instruction while VScnt has its own instruction.
+ if (Wait.hasWaitExceptStoreCnt()) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ if (Wait.hasWaitStoreCnt()) {
+ assert(ST->hasVscnt());
+
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.StoreCnt);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
+}
+
+//===----------------------------------------------------------------------===//
+// WaitCntGeneratorGFX12Plus member functions.
+//===----------------------------------------------------------------------===//
+
+AMDGPU::Waitcnt
+WaitCntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+}
+
+/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
+/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
+/// were added by previous passes. Currently this pass conservatively
+/// assumes that these preexisting waits are required for correctness.
+bool WaitCntGeneratorGFX12Plus::applyPreexistingWaitcnt(
+ WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+ assert(ST);
+ assert(!isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ MachineInstr *CombinedLoadDsCntInstr = nullptr;
+ MachineInstr *CombinedStoreDsCntInstr = nullptr;
+ MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
+
+ for (auto &II :
+ make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+ if (II.isMetaInstruction())
+ continue;
+
+ MachineInstr **UpdatableInstr;
+
+ // Update required wait count. If this is a soft waitcnt (= it was added
+ // by an earlier pass), it may be entirely removed.
+
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+ bool IsSoft = Opcode != II.getOpcode();
+
+ if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
+ unsigned OldEnc =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
+ UpdatableInstr = &CombinedLoadDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
+ unsigned OldEnc =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
+ UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else {
+ std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
+ assert(CT.has_value());
+ unsigned OldCnt =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
+ addWait(Wait, CT.value(), OldCnt);
+ UpdatableInstr = &WaitInstrs[CT.value()];
+ }
+
+ // Merge consecutive waitcnt of the same type by erasing multiples.
+ if (!*UpdatableInstr) {
+ *UpdatableInstr = &II;
+ } else {
+ II.eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (CombinedLoadDsCntInstr) {
+ // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
+ // to be waited for. Otherwise, let the instruction be deleted so
+ // the appropriate single counter wait instruction can be inserted
+ // instead, when new S_WAIT_*CNT instructions are inserted by
+ // createNewWaitcnt(). As a side effect, resetting the wait counts will
+ // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
+ // the loop below that deals with single counter instructions.
+ if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
+ unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+ Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
+ AMDGPU::OpName::simm16, NewEnc);
+ Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
+ ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+ ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ Wait.LoadCnt = ~0u;
+ Wait.DsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *CombinedLoadDsCntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *CombinedLoadDsCntInstr << '\n');
+ } else {
+ CombinedLoadDsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (CombinedStoreDsCntInstr) {
+ // Similarly for S_WAIT_STORECNT_DSCNT.
+ if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
+ unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+ Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
+ AMDGPU::OpName::simm16, NewEnc);
+ Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
+ ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ Wait.StoreCnt = ~0u;
+ Wait.DsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *CombinedStoreDsCntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *CombinedStoreDsCntInstr << '\n');
+ } else {
+ CombinedStoreDsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ // Look for an opportunity to convert existing S_WAIT_LOADCNT,
+ // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
+ // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
+ // instructions so that createNewWaitcnt() will create new combined
+ // instructions to replace them.
+
+ if (Wait.DsCnt != ~0u) {
+ // This is a vector of addresses in WaitInstrs pointing to instructions
+ // that should be removed if they are present.
+ SmallVector<MachineInstr **, 2> WaitsToErase;
+
+ // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
+ // both) need to be waited for, ensure that there are no existing
+ // individual wait count instructions for these.
+
+ if (Wait.LoadCnt != ~0u) {
+ WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+ } else if (Wait.StoreCnt != ~0u) {
+ WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+ }
+
+ for (MachineInstr **WI : WaitsToErase) {
+ if (!*WI)
+ continue;
+
+ (*WI)->eraseFromParent();
+ *WI = nullptr;
+ Modified = true;
+ }
+ }
+
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ if (!WaitInstrs[CT])
+ continue;
+
+ unsigned NewCnt = getWait(Wait, CT);
+ if (NewCnt != ~0u) {
+ Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
+ AMDGPU::OpName::simm16, NewCnt);
+ Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
+
+ ScoreBrackets.applyWaitcnt(CT, NewCnt);
+ setNoWait(Wait, CT);
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitInstrs[CT]
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitInstrs[CT] << '\n');
+ } else {
+ WaitInstrs[CT]->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
+/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
+bool WaitCntGeneratorGFX12Plus::createNewWaitcnt(
+ MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) {
+ assert(ST);
+ assert(!isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ const DebugLoc &DL = Block.findDebugLoc(It);
+
+ // Check for opportunities to use combined wait instructions.
+ if (Wait.DsCnt != ~0u) {
+ MachineInstr *SWaitInst = nullptr;
+
+ if (Wait.LoadCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+
+ SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+ .addImm(Enc);
+
+ Wait.LoadCnt = ~0u;
+ Wait.DsCnt = ~0u;
+ } else if (Wait.StoreCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+
+ SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
+ .addImm(Enc);
+
+ Wait.StoreCnt = ~0u;
+ Wait.DsCnt = ~0u;
+ }
+
+ if (SWaitInst) {
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+ }
+
+ // Generate an instruction for any remaining counter that needs
+ // waiting for.
+
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ unsigned Count = getWait(Wait, CT);
+ if (Count == ~0u)
+ continue;
+
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Count);
+
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUWaitCntInserter member functions.
+//===----------------------------------------------------------------------===//
+
+// This is a flat memory operation. Check to see if it has memory tokens other
+// than LDS. Other address spaces supported by flat memory operations involve
+// global memory.
+bool AMDGPUWaitCntInserter::mayAccessVMEMThroughFlat(
+ const MachineInstr &MI) const {
+ assert(TII->isFLAT(MI));
+
+ // All flat instructions use the VMEM counter.
+ assert(TII->usesVM_CNT(MI));
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access VMEM.
+ if (MI.memoperands_empty())
+ return true;
+
+ // See if any memory operand specifies an address space that involves VMEM.
+  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
+  // involving VMEM such as GLOBAL, CONSTANT and PRIVATE (SCRATCH). The REGION
+  // (GDS) address space is not supported by flat operations. Therefore, simply
+ // return true unless only the LDS address space is found.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ assert(AS != AMDGPUAS::REGION_ADDRESS);
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ return true;
+ }
+
+ return false;
+}
+
+// This is a flat memory operation. Check to see if it has memory tokens for
+// either scratch or FLAT.
+bool AMDGPUWaitCntInserter::mayAccessScratchThroughFlat(
+ const MachineInstr &MI) const {
+ assert(TII->isFLAT(MI));
+
+ // SCRATCH instructions always access scratch.
+ if (TII->isFLATScratch(MI))
+ return true;
+
+ // GLOBAL instructions never access scratch.
+ if (TII->isFLATGlobal(MI))
+ return false;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+ // See if any memory operand specifies an address space that involves scratch.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
+bool AMDGPUWaitCntInserter::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
+ return SIInstrInfo::isVMEM(MI) ||
+ (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
+}
+
+bool AMDGPUWaitCntInserter::generateWaitcnt(
+ AMDGPU::Waitcnt Wait, MachineBasicBlock::instr_iterator It,
+ MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr) {
+ bool Modified = false;
+
+ if (OldWaitcntInstr)
+ // Try to merge the required wait with preexisting waitcnt instructions.
+ // Also erase redundant waitcnt.
+ Modified =
+ WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+
+ // Any counts that could have been applied to any existing waitcnt
+ // instructions will have been done so, now deal with any remaining.
+ ScoreBrackets.applyWaitcnt(Wait);
+
+ // ExpCnt can be merged into VINTERP.
+ if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
+ SIInstrInfo::isVINTERP(*It)) {
+ MachineOperand *WaitExp =
+ TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
+ if (Wait.ExpCnt < WaitExp->getImm()) {
+ WaitExp->setImm(Wait.ExpCnt);
+ Modified = true;
+ }
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
+ << "Update Instr: " << *It);
+ }
+
+ if (WCG->createNewWaitcnt(Block, It, Wait))
+ Modified = true;
+
+ return Modified;
+}
+
+// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
+// end of the given block if needed.
+bool AMDGPUWaitCntInserter::generateWaitcntBlockEnd(
+ MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr) {
+ AMDGPU::Waitcnt Wait;
+
+ unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
+ unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
+ unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
+
+ if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
+ return false;
+
+ if (LoadCntPending != 0)
+ Wait.LoadCnt = 0;
+ if (SampleCntPending != 0)
+ Wait.SampleCnt = 0;
+ if (BvhCntPending != 0)
+ Wait.BvhCnt = 0;
+
+ return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+ OldWaitcntInstr);
+}
+
+bool AMDGPUWaitCntInserter::insertWaitCntsInFunction(MachineFunction &MF,
+ VGPRInstsSet *VGPRInsts) {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+ InstCounterType SmemAccessCounter =
+ eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
+
+ unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+ unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
+ assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+
+ RegisterEncoding Encoding = {};
+ Encoding.VGPR0 =
+ TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
+ Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
+ Encoding.SGPR0 =
+ TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
+ Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
+
+ MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
+ BlockInfos.clear();
+ bool Modified = false;
+
+ MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+
+ if (!MFI->isEntryFunction()) {
+ // Wait for any outstanding memory operations that the input registers may
+ // depend on. We can't track them and it's better to do the wait after the
+ // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+
+ if (ST->hasExtendedWaitCounts()) {
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+ .addImm(0);
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+ continue;
+
+ BuildMI(EntryBB, I, DebugLoc(),
+ TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(0);
+ }
+ } else {
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ }
+
+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
+ ST, MaxCounter, Encoding, WaitEventMaskForInst, SmemAccessCounter);
+ NonKernelInitialState->setStateOnFunctionEntryOrReturn();
+ BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+
+ Modified = true;
+ }
+
+ // Keep iterating over the blocks in reverse post order, inserting and
+ // updating s_waitcnt where needed, until a fix point is reached.
+ for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
+ BlockInfos.insert({MBB, BlockInfo()});
+
+ std::unique_ptr<WaitcntBrackets> Brackets;
+ bool Repeat;
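+  // Repeat is set when a dirty successor appears at or before the current
+  // block in the traversal order (a back edge), so another pass over the
+  // blocks is needed to reach the fixed point.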
+ do {
+ Repeat = false;
+
+ for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
+ ++BII) {
+ MachineBasicBlock *MBB = BII->first;
+ BlockInfo &BI = BII->second;
+ if (!BI.Dirty)
+ continue;
+
+ if (BI.Incoming) {
+ if (!Brackets)
+ Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
+ else
+ *Brackets = *BI.Incoming;
+ } else {
+ if (!Brackets)
+ Brackets = std::make_unique<WaitcntBrackets>(ST, MaxCounter, Encoding,
+ WaitEventMaskForInst,
+ SmemAccessCounter);
+ else
+ *Brackets = WaitcntBrackets(ST, MaxCounter, Encoding,
+ WaitEventMaskForInst, SmemAccessCounter);
+ }
+
+ Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets, VGPRInsts);
+ BI.Dirty = false;
+
+ if (Brackets->hasPendingEvent()) {
+ BlockInfo *MoveBracketsToSucc = nullptr;
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ auto SuccBII = BlockInfos.find(Succ);
+ BlockInfo &SuccBI = SuccBII->second;
+ if (!SuccBI.Incoming) {
+ SuccBI.Dirty = true;
+ if (SuccBII <= BII)
+ Repeat = true;
+ if (!MoveBracketsToSucc) {
+ MoveBracketsToSucc = &SuccBI;
+ } else {
+ SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
+ }
+ } else if (SuccBI.Incoming->merge(*Brackets)) {
+ SuccBI.Dirty = true;
+ if (SuccBII <= BII)
+ Repeat = true;
+ }
+ }
+ if (MoveBracketsToSucc)
+ MoveBracketsToSucc->Incoming = std::move(Brackets);
+ }
+ }
+ } while (Repeat);
+
+ if (ST->hasScalarStores()) {
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+ bool HaveScalarStores = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!HaveScalarStores && TII->isScalarStore(MI))
+ HaveScalarStores = true;
+
+ if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+ EndPgmBlocks.push_back(&MBB);
+ }
+ }
+
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could
+ // be improved by looking across blocks for flushes in postdominating
+ // blocks from the stores but an explicitly requested flush is probably
+ // very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+ !SeenDCacheWB) {
+ Modified = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+ }
+
+ // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
+ // instructions.
+ for (MachineInstr *MI : *VGPRInsts) {
+ if (ST->requiresNopBeforeDeallocVGPRs()) {
+ BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ }
+ BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
+ .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ Modified = true;
+ }
+ VGPRInsts->clear();
+
+ return Modified;
+}
+
+} // namespace AMDGPU
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h
new file mode 100644
index 00000000000000..b5f74c932672ba
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUWaitCountUtils.h
@@ -0,0 +1,531 @@
+//===- AMDGPUWaitCountUtils.h - Wait count insertion interface -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUWAITCOUNTUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUWAITCOUNTUTILS_H
+
+#include "GCNSubtarget.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Class of object that encapsulates latest instruction counter score
+// associated with the operand. Used for determining whether
+// s_waitcnt instruction needs to be emitted.
+enum InstCounterType : uint8_t {
+ CT_START = 0,
+ LOAD_CNT = CT_START, // VMcnt prior to gfx12.
+  DS_CNT, // LGKMcnt prior to gfx12.
+ EXP_CNT, //
+ STORE_CNT, // VScnt in gfx10/gfx11.
+ NUM_NORMAL_INST_CNTS,
+ SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+ BVH_CNT, // gfx12+ only.
+ KM_CNT, // gfx12+ only.
+ NUM_EXTENDED_INST_CNTS,
+ NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+};
+
+} // namespace AMDGPU
+
+using AMDGPU::InstCounterType;
+
+template <> struct enum_iteration_traits<InstCounterType> {
+ static constexpr bool is_iterable = true;
+};
+
+namespace AMDGPU {
+
+// Return an iterator over all counters between the first counter and \c
+// MaxCounter (exclusive, default value yields an enumeration over all
+// counters).
+inline auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
+ return enum_seq(CT_START, MaxCounter);
+}
+
+enum WaitEventType : uint8_t {
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
+ VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
+ VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
+ SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
+ NUM_WAIT_EVENTS
+};
+using AMDGPU::WaitEventType;
+
+using RegInterval = std::pair<int, int>;
+
+struct RegisterEncoding {
+ unsigned VGPR0;
+ unsigned VGPRL;
+ unsigned SGPR0;
+ unsigned SGPRL;
+};
+
+// The mapping is:
+// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
+// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
+// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// We reserve a fixed number of VGPR slots in the scoring tables for
+// special tokens like SCMEM_LDS (needed for buffer load to LDS).
+enum RegisterMapping {
+ SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
+ AGPR_OFFSET = 256, // Maximum programmable AccVGPRs across all targets.
+ SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+  NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
+ // Artificial register slots to track LDS writes into specific LDS locations
+ // if a location is known. When slots are exhausted or location is
+ // unknown use the first slot. The first slot is also always updated in
+ // addition to known location's slot to properly generate waits if dependent
+ // instruction's location is unknown.
+ EXTRA_VGPR_LDS = 0,
+ NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
+};
+
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+ // BUF instructions and MIMG instructions without a sampler.
+ VMEM_NOSAMPLER,
+ // MIMG instructions with a sampler.
+ VMEM_SAMPLER,
+ // BVH instructions
+ VMEM_BVH,
+ NUM_VMEM_TYPES
+};
+
+// Maps values of InstCounterType to the instruction that waits on that
+// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
+// returns true.
+static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
+ AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
+ AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+ AMDGPU::S_WAIT_KMCNT};
+
+using WaitCntBitMaskFn = std::function<unsigned(const IsaVersion &)>;
+// This object maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of events happen in the bracket, the
+// wait count may get decremented out of order, so we need to put in an
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+ WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+ RegisterEncoding Encoding,
+ const unsigned *WaitEventMaskForInst,
+ InstCounterType SmemAccessCounter)
+ : ST(SubTarget), MaxCounter(MaxCounter), Encoding(Encoding),
+ WaitEventMaskForInst(WaitEventMaskForInst),
+ SmemAccessCounter(SmemAccessCounter) {
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ for (auto T : inst_counter_types()) {
+ auto Fn = getWaitCntBitMaskFn(T);
+ HardwareLimits[T] = Fn(IV);
+ }
+ }
+
+ unsigned getWaitCountMax(InstCounterType T) const {
+ return HardwareLimits[T];
+ }
+
+ bool isSmemCounter(InstCounterType T) const { return T == SmemAccessCounter; }
+
+ unsigned getScoreLB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreLBs[T];
+ }
+
+ unsigned getScoreUB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreUBs[T];
+ }
+
+ unsigned getScoreRange(InstCounterType T) const {
+ return getScoreUB(T) - getScoreLB(T);
+ }
+
+ unsigned getRegScore(int GprNo, InstCounterType T) const {
+ if (GprNo < NUM_ALL_VGPRS) {
+ return VgprScores[T][GprNo];
+ }
+ assert(isSmemCounter(T));
+ return SgprScores[GprNo - NUM_ALL_VGPRS];
+ }
+
+ bool merge(const WaitcntBrackets &Other);
+
+ RegInterval getRegInterval(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI, unsigned OpNo) const;
+
+ bool counterOutOfOrder(InstCounterType T) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
+ void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+ void applyWaitcnt(InstCounterType T, unsigned Count);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ unsigned hasPendingEvent() const { return PendingEvents; }
+ unsigned hasPendingEvent(WaitEventType E) const {
+ return PendingEvents & (1 << E);
+ }
+ unsigned hasPendingEvent(InstCounterType T) const {
+ unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+ assert((HasPending != 0) == (getScoreRange(T) != 0));
+ return HasPending;
+ }
+
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = hasPendingEvent(T);
+ // Return true if more than one bit is set in Events.
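+    // E.g. 0b0110 & 0b0101 != 0, but 0b1000 & 0b0111 == 0.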
+ return Events & (Events - 1);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+ }
+
+ // Return true if there might be pending writes to the specified vgpr by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+ assert(GprNo < NUM_ALL_VGPRS);
+ return VgprVmemTypes[GprNo] & ~(1 << V);
+ }
+
+ void clearVgprVmemTypes(int GprNo) {
+ assert(GprNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[GprNo] = 0;
+ }
+
+ void setStateOnFunctionEntryOrReturn() {
+ setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
+ PendingEvents |= WaitEventMaskForInst[STORE_CNT];
+ }
+
+ ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+ return LDSDMAStores;
+ }
+
+ void print(raw_ostream &);
+ void dump() { print(dbgs()); }
+
+private:
+ struct MergeInfo {
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
+ };
+
+ WaitCntBitMaskFn getWaitCntBitMaskFn(InstCounterType T);
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
+
+ void setScoreLB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreLBs[T] = Val;
+ }
+
+ void setScoreUB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreUBs[T] = Val;
+
+ if (T != EXP_CNT)
+ return;
+
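+    // Keep the EXP_CNT score range within the hardware counter width by
+    // advancing the lower bound when it would otherwise be exceeded.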
+ if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
+ if (GprNo < NUM_ALL_VGPRS) {
+ VgprUB = std::max(VgprUB, GprNo);
+ VgprScores[T][GprNo] = Val;
+ } else {
+ assert(isSmemCounter(T));
+ SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
+ SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+ }
+ }
+
+ void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+ unsigned OpNo, unsigned Val);
+
+ const GCNSubtarget *ST = nullptr;
+ InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
+ unsigned HardwareLimits[NUM_INST_CNTS] = {0};
+ RegisterEncoding Encoding = {};
+ const unsigned *WaitEventMaskForInst;
+ InstCounterType SmemAccessCounter;
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
+ // Remember the last flat memory operation.
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
+  // Wait cnt scores for every sgpr; only the DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12) or KM_CNT (gfx12+ only) counter is relevant.
+ unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+ // Store representative LDS DMA operations. The only useful info here is
+ // alias info. One store is kept per unique AAInfo.
+ SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
+};
+
+struct BlockInfo {
+ std::unique_ptr<WaitcntBrackets> Incoming;
+ bool Dirty = true;
+};
+
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and instructions for waiting on them
+// underwent a major shift with gfx12, sufficiently so that having this
+// abstraction allows the main analysis logic to be simpler than it would
+// otherwise have had to become.
+class WaitCntGenerator {
+protected:
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ AMDGPU::IsaVersion IV;
+ InstCounterType MaxCounter;
+
+public:
+ WaitCntGenerator() {}
+ WaitCntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : ST(ST), TII(ST->getInstrInfo()),
+ IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+ // Edits an existing sequence of wait count instructions according
+ // to an incoming Waitcnt value, which is itself updated to reflect
+ // any new wait count instructions which may need to be generated by
+ // WaitCntGenerator::createNewWaitcnt(). It will return true if any edits
+ // were made.
+ //
+  // This editing will usually merely update operands, but it may also
+ // delete instructions if the incoming Wait value indicates they are not
+ // needed. It may also remove existing instructions for which a wait
+ // is needed if it can be determined that it is better to generate new
+ // instructions later, as can happen on gfx12.
+ virtual bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const = 0;
+
+ // Generates new wait count instructions according to the value of
+ // Wait, returning true if any new instructions were created.
+ virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) = 0;
+
+ // Returns an array of bit masks which can be used to map values in
+ // WaitEventType to corresponding counter values in InstCounterType.
+ virtual const unsigned *getWaitEventMask() const = 0;
+
+ // Returns a new waitcnt with all counters except VScnt set to 0. If
+ // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
+
+ virtual ~WaitCntGenerator() = default;
+
+ // Transform a soft waitcnt into a normal one.
+ bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
+ unsigned Opcode =
+ SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
+ if (Opcode == Waitcnt->getOpcode())
+ return false;
+
+ Waitcnt->setDesc(TII->get(Opcode));
+ return true;
+ }
+
+ // Create a mask value from the initializer list of wait event types.
+ unsigned eventMask(std::initializer_list<WaitEventType> Events) const {
+ unsigned Mask = 0;
+ for (auto &E : Events)
+ Mask |= 1 << E;
+
+ return Mask;
+ }
+};
+
+class WaitCntGeneratorPreGFX12 : public WaitCntGenerator {
+public:
+ WaitCntGeneratorPreGFX12() {}
+ WaitCntGeneratorPreGFX12(const GCNSubtarget *ST)
+ : WaitCntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
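+    // Indexed by InstCounterType: LOAD_CNT, DS_CNT, EXP_CNT and STORE_CNT;
+    // the gfx12-only counters have no events before gfx12.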
+ static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
+ VMEM_BVH_READ_ACCESS}),
+ eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+ eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+ EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+ eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+ 0,
+ 0,
+ 0};
+
+ return WaitEventMaskForInstPreGFX12;
+ }
+};
+
+class WaitCntGeneratorGFX12Plus : public WaitCntGenerator {
+public:
+ WaitCntGeneratorGFX12Plus() {}
+ WaitCntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : WaitCntGenerator(ST, MaxCounter) {}
+
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
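+    // Indexed by InstCounterType: LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
+    // SAMPLE_CNT, BVH_CNT and KM_CNT.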
+ static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
+ eventMask({LDS_ACCESS, GDS_ACCESS}),
+ eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+ EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+ eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+ eventMask({VMEM_SAMPLER_READ_ACCESS}),
+ eventMask({VMEM_BVH_READ_ACCESS}),
+ eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+
+ return WaitEventMaskForInstGFX12Plus;
+ }
+};
+
+using VGPRInstsSet = DenseSet<MachineInstr *>;
+
+/// This class provides the abstraction for the wait count insertions in a
+/// function. Virtual methods are provided to handle the waitcnt insertion in a
+/// basic block for various memory operations as per subtarget requirements.
+class AMDGPUWaitCntInserter {
+public:
+ AMDGPUWaitCntInserter() {}
+ AMDGPUWaitCntInserter(const GCNSubtarget *ST, const MachineRegisterInfo *MRI,
+ WaitCntGenerator *WCG, InstCounterType MC)
+ : ST(ST), TII(ST->getInstrInfo()), TRI(ST->getRegisterInfo()), MRI(MRI),
+ WCG(WCG), MaxCounter(MC) {}
+ virtual ~AMDGPUWaitCntInserter() = default;
+
+ InstCounterType getMaxCounter() const { return MaxCounter; }
+
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ bool generateWaitcnt(AMDGPU::Waitcnt Wait,
+ MachineBasicBlock::instr_iterator It,
+ MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
+ bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
+ bool insertWaitCntsInFunction(MachineFunction &MF, VGPRInstsSet *VGPRInsts);
+
+ virtual bool generateWaitcntInstBefore(MachineInstr &MI,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr,
+ bool FlushVmCnt,
+ VGPRInstsSet *VGPRInsts) = 0;
+
+ virtual bool insertWaitcntInBlock(MachineFunction &MF,
+ MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ VGPRInstsSet *VGPRInsts) = 0;
+
+ virtual void updateEventWaitcntAfter(MachineInstr &Inst,
+ WaitcntBrackets *ScoreBrackets) = 0;
+
+protected:
+ bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
+ bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
+
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+
+  // WCG points to one of the derived generator objects; it must have been
+  // initialised with a subtarget-aware constructor before use.
+ WaitCntGenerator *WCG = nullptr;
+ InstCounterType MaxCounter;
+};
+
+bool isWaitInstr(MachineInstr &Inst);
+VmemType getVmemType(const MachineInstr &Inst);
+bool callWaitsOnFunctionEntry(const MachineInstr &MI);
+bool callWaitsOnFunctionReturn(const MachineInstr &MI);
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E);
+bool readsVCCZ(const MachineInstr &MI);
+bool isCacheInvOrWBInst(MachineInstr &Inst);
+bool updateVMCntOnly(const MachineInstr &Inst);
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count);
+WaitCntGenerator *getWaitCntGenerator(MachineFunction &MF,
+ WaitCntGeneratorPreGFX12 &WCGPreGFX12,
+ WaitCntGeneratorGFX12Plus &WCGGFX12Plus,
+ InstCounterType &MaxCounter);
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUWAITCOUNTUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 19d3b690b1315d..c0b6b293f79e43 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
AMDGPUMemoryUtils.cpp
AMDGPUPALMetadata.cpp
+ AMDGPUWaitCountUtils.cpp
AMDKernelCodeTUtils.cpp
LINK_COMPONENTS
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 48f00a82e3e1c6..90d9fd71f6e2cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -130,7 +130,7 @@
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Machine Natural Loop Construction
; GCN-O0-NEXT: MachinePostDominator Tree Construction
-; GCN-O0-NEXT: SI insert wait instructions
+; GCN-O0-NEXT: SI Insert Wait Instructions
; GCN-O0-NEXT: Insert required mode register values
; GCN-O0-NEXT: SI Final Branch Preparation
; GCN-O0-NEXT: Post RA hazard recognizer
@@ -396,7 +396,7 @@
; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: MachinePostDominator Tree Construction
-; GCN-O1-NEXT: SI insert wait instructions
+; GCN-O1-NEXT: SI Insert Wait Instructions
; GCN-O1-NEXT: Insert required mode register values
; GCN-O1-NEXT: SI Insert Hard Clauses
; GCN-O1-NEXT: SI Final Branch Preparation
@@ -693,7 +693,7 @@
; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
-; GCN-O1-OPTS-NEXT: SI insert wait instructions
+; GCN-O1-OPTS-NEXT: SI Insert Wait Instructions
; GCN-O1-OPTS-NEXT: Insert required mode register values
; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
@@ -996,7 +996,7 @@
; GCN-O2-NEXT: MachineDominator Tree Construction
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: MachinePostDominator Tree Construction
-; GCN-O2-NEXT: SI insert wait instructions
+; GCN-O2-NEXT: SI Insert Wait Instructions
; GCN-O2-NEXT: Insert required mode register values
; GCN-O2-NEXT: SI Insert Hard Clauses
; GCN-O2-NEXT: SI Final Branch Preparation
@@ -1311,7 +1311,7 @@
; GCN-O3-NEXT: MachineDominator Tree Construction
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: MachinePostDominator Tree Construction
-; GCN-O3-NEXT: SI insert wait instructions
+; GCN-O3-NEXT: SI Insert Wait Instructions
; GCN-O3-NEXT: Insert required mode register values
; GCN-O3-NEXT: SI Insert Hard Clauses
; GCN-O3-NEXT: SI Final Branch Preparation