[llvm-branch-commits] [llvm] [AMDGPU][InsertWaitCnts] Move HWEvent analysis code (PR #202887)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 11 04:55:51 PDT 2026
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/202887
>From 556ad5be5d9d0453c4a433a870fbc3adca9751c1 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 10 Jun 2026 09:40:35 +0200
Subject: [PATCH 1/5] [AMDGPU][InsertWaitCnts] Move HWEvent analysis code
Building on the previous RFC (assuming it is accepted):
Move the code that maps a MachineInstr to an HWEventSet into a separate file.
This should be NFC (no functional change).
---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp | 164 ++++++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h | 6 +
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 119 +-------------
3 files changed, 173 insertions(+), 116 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index dfa7006454353..937c8d05a5d1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUHWEvents.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -25,5 +27,167 @@ void HWEventSet::dump() const {
print(dbgs());
dbgs() << "\n";
}
+
+static std::optional<HWEvent>
+getExpertSchedulingEventType(const MachineInstr &Inst, const SIInstrInfo &TII) {
+ if (TII.isVALU(Inst)) {
+ // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
+ // out-of-order with respect to each other, so each of these classes
+ // has its own event.
+
+ if (TII.isXDL(Inst))
+ return HWEvent::VGPR_XDL_WRITE;
+
+ if (TII.isTRANS(Inst))
+ return HWEvent::VGPR_TRANS_WRITE;
+
+ if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
+ return HWEvent::VGPR_DPMACC_WRITE;
+
+ return HWEvent::VGPR_CSMACC_WRITE;
+ }
+
+ // FLAT and LDS instructions may read their VGPR sources out-of-order
+ // with respect to each other and all other VMEM instructions, so
+ // each of these also has a separate event.
+
+ if (TII.isFLAT(Inst))
+ return HWEvent::VGPR_FLAT_READ;
+
+ if (TII.isDS(Inst))
+ return HWEvent::VGPR_LDS_READ;
+
+ if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
+ return HWEvent::VGPR_VMEM_READ;
+
+ // Otherwise, no hazard.
+
+ return {};
+}
+
+static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
+ const SIInstrInfo &TII) {
+ switch (Inst.getOpcode()) {
+ // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
+ case AMDGPU::GLOBAL_INV:
+ return HWEvent::GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't
+ // write VGPRs
+ case AMDGPU::GLOBAL_WB:
+ case AMDGPU::GLOBAL_WBINV:
+ return HWEvent::VMEM_WRITE_ACCESS; // tracked using storecnt
+ default:
+ break;
+ }
+
+ assert(SIInstrInfo::isVMEM(Inst));
+ // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+ // these should use VM_CNT.
+ if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+ return HWEvent::VMEM_ACCESS;
+ if (Inst.mayStore() &&
+ (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
+ if (TII.mayAccessScratch(Inst))
+ return HWEvent::SCRATCH_WRITE_ACCESS;
+ return HWEvent::VMEM_WRITE_ACCESS;
+ }
+ if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+ return HWEvent::VMEM_ACCESS;
+
+ if (SIInstrInfo::isImage(Inst)) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+
+ if (BaseInfo->BVH)
+ return HWEvent::VMEM_BVH_READ_ACCESS;
+
+ // We have to make an additional check for isVSAMPLE here since some
+ // instructions don't have a sampler, but are still classified as sampler
+ // instructions for the purposes of e.g. waitcnt.
+ if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
+ return HWEvent::VMEM_SAMPLER_READ_ACCESS;
+ }
+
+ return HWEvent::VMEM_ACCESS;
+}
+
+HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode) {
+ const SIInstrInfo &TII = *ST.getInstrInfo();
+
+ HWEventSet Events;
+ if (IsExpertMode) {
+ if (const auto ET = getExpertSchedulingEventType(Inst, TII))
+ Events.insert(*ET);
+ }
+
+ if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
+ if (TII.isAlwaysGDS(Inst.getOpcode()) ||
+ TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+ Events.insert(HWEvent::GDS_ACCESS);
+ Events.insert(HWEvent::GDS_GPR_LOCK);
+ } else {
+ Events.insert(HWEvent::LDS_ACCESS);
+ }
+ } else if (TII.isFLAT(Inst)) {
+ if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
+ Events.insert(getVmemHWEvent(Inst, ST, TII));
+ } else {
+ assert(Inst.mayLoadOrStore());
+ if (TII.mayAccessVMEMThroughFlat(Inst)) {
+ if (ST.hasWaitXcnt())
+ Events.insert(HWEvent::VMEM_GROUP);
+ Events.insert(getVmemHWEvent(Inst, ST, TII));
+ }
+ if (TII.mayAccessLDSThroughFlat(Inst))
+ Events.insert(HWEvent::LDS_ACCESS);
+ }
+ } else if (SIInstrInfo::isVMEM(Inst) &&
+ (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) ||
+ Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
+ // BUFFER_WBL2 is included here because unlike invalidates, has to be
+ // followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
+ // completed.
+ if (ST.hasWaitXcnt())
+ Events.insert(HWEvent::VMEM_GROUP);
+ Events.insert(getVmemHWEvent(Inst, ST, TII));
+ if (ST.vmemWriteNeedsExpWaitcnt() &&
+ (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
+ Events.insert(HWEvent::VMW_GPR_LOCK);
+ }
+ } else if (TII.isSMRD(Inst)) {
+ if (ST.hasWaitXcnt())
+ Events.insert(HWEvent::SMEM_GROUP);
+ Events.insert(HWEvent::SMEM_ACCESS);
+ } else if (SIInstrInfo::isLDSDIR(Inst)) {
+ Events.insert(HWEvent::EXP_LDS_ACCESS);
+ } else if (SIInstrInfo::isEXP(Inst)) {
+ unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+ if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
+ Events.insert(HWEvent::EXP_PARAM_ACCESS);
+ else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
+ Events.insert(HWEvent::EXP_POS_ACCESS);
+ else
+ Events.insert(HWEvent::EXP_GPR_LOCK);
+ } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
+ Events.insert(HWEvent::SCC_WRITE);
+ } else {
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSG_RTN_B32:
+ case AMDGPU::S_SENDMSG_RTN_B64:
+ case AMDGPU::S_SENDMSGHALT:
+ Events.insert(HWEvent::SQ_MESSAGE);
+ break;
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ case AMDGPU::S_GET_BARRIER_STATE_M0:
+ case AMDGPU::S_GET_BARRIER_STATE_IMM:
+ Events.insert(HWEvent::SMEM_ACCESS);
+ break;
+ }
+ }
+ return Events;
+}
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
index 81487c6b20957..2ed1a2a361d1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
@@ -13,6 +13,8 @@
#include "llvm/ADT/StringRef.h"
namespace llvm {
+class GCNSubtarget;
+class MachineInstr;
class raw_ostream;
namespace AMDGPU {
@@ -116,6 +118,10 @@ class HWEventSet {
LLVM_DUMP_METHOD void dump() const;
};
+/// \returns all HWEvents triggered by \p Inst
+HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode);
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index acca2e1240635..3bc4357471169 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -509,42 +509,6 @@ class SIInsertWaitcnts {
#endif // NDEBUG
}
- // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
- // instruction.
- HWEvent getVmemHWEvent(const MachineInstr &Inst) const {
- switch (Inst.getOpcode()) {
- // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
- case AMDGPU::GLOBAL_INV:
- return HWEvent::GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't
- // write VGPRs
- case AMDGPU::GLOBAL_WB:
- case AMDGPU::GLOBAL_WBINV:
- return HWEvent::VMEM_WRITE_ACCESS; // tracked using storecnt
- default:
- break;
- }
-
- // Maps VMEM access types to their corresponding HWEvent.
- static const HWEvent VmemReadMapping[NUM_VMEM_TYPES] = {
- HWEvent::VMEM_ACCESS, HWEvent::VMEM_SAMPLER_READ_ACCESS,
- HWEvent::VMEM_BVH_READ_ACCESS};
-
- assert(SIInstrInfo::isVMEM(Inst));
- // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
- // these should use VM_CNT.
- if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
- return HWEvent::VMEM_ACCESS;
- if (Inst.mayStore() &&
- (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
- if (TII.mayAccessScratch(Inst))
- return HWEvent::SCRATCH_WRITE_ACCESS;
- return HWEvent::VMEM_WRITE_ACCESS;
- }
- if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
- return HWEvent::VMEM_ACCESS;
- return VmemReadMapping[getVmemType(Inst)];
- }
-
std::optional<HWEvent>
getExpertSchedulingEventType(const MachineInstr &Inst) const;
@@ -586,8 +550,6 @@ class SIInsertWaitcnts {
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr);
- /// \returns all events that correspond to \p Inst.
- HWEventSet getEventsFor(const MachineInstr &Inst) const;
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1567,7 +1529,8 @@ MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
return Reg32;
// If hi/lo16 mixed events
- HWEventSet MIEvents = Context->getEventsFor(MI);
+ HWEventSet MIEvents =
+ AMDGPU::getEventsFor(MI, Context->ST, Context->IsExpertMode);
HWEventSet OtherHalfEvents = Context->getWaitEvents(T);
HWEventSet Events = MIEvents & OtherHalfEvents;
if (Events.twoOrMore())
@@ -2840,86 +2803,10 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
return Result;
}
-HWEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
- HWEventSet Events;
- if (IsExpertMode) {
- if (const auto ET = getExpertSchedulingEventType(Inst))
- Events.insert(*ET);
- }
-
- if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
- if (TII.isAlwaysGDS(Inst.getOpcode()) ||
- TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
- Events.insert(HWEvent::GDS_ACCESS);
- Events.insert(HWEvent::GDS_GPR_LOCK);
- } else {
- Events.insert(HWEvent::LDS_ACCESS);
- }
- } else if (TII.isFLAT(Inst)) {
- if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
- Events.insert(getVmemHWEvent(Inst));
- } else {
- assert(Inst.mayLoadOrStore());
- if (TII.mayAccessVMEMThroughFlat(Inst)) {
- if (ST.hasWaitXcnt())
- Events.insert(HWEvent::VMEM_GROUP);
- Events.insert(getVmemHWEvent(Inst));
- }
- if (TII.mayAccessLDSThroughFlat(Inst))
- Events.insert(HWEvent::LDS_ACCESS);
- }
- } else if (SIInstrInfo::isVMEM(Inst) &&
- (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) ||
- Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
- // BUFFER_WBL2 is included here because unlike invalidates, has to be
- // followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
- // completed.
- if (ST.hasWaitXcnt())
- Events.insert(HWEvent::VMEM_GROUP);
- Events.insert(getVmemHWEvent(Inst));
- if (ST.vmemWriteNeedsExpWaitcnt() &&
- (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
- Events.insert(HWEvent::VMW_GPR_LOCK);
- }
- } else if (TII.isSMRD(Inst)) {
- if (ST.hasWaitXcnt())
- Events.insert(HWEvent::SMEM_GROUP);
- Events.insert(HWEvent::SMEM_ACCESS);
- } else if (SIInstrInfo::isLDSDIR(Inst)) {
- Events.insert(HWEvent::EXP_LDS_ACCESS);
- } else if (SIInstrInfo::isEXP(Inst)) {
- unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
- if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
- Events.insert(HWEvent::EXP_PARAM_ACCESS);
- else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
- Events.insert(HWEvent::EXP_POS_ACCESS);
- else
- Events.insert(HWEvent::EXP_GPR_LOCK);
- } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
- Events.insert(HWEvent::SCC_WRITE);
- } else {
- switch (Inst.getOpcode()) {
- case AMDGPU::S_SENDMSG:
- case AMDGPU::S_SENDMSG_RTN_B32:
- case AMDGPU::S_SENDMSG_RTN_B64:
- case AMDGPU::S_SENDMSGHALT:
- Events.insert(HWEvent::SQ_MESSAGE);
- break;
- case AMDGPU::S_MEMTIME:
- case AMDGPU::S_MEMREALTIME:
- case AMDGPU::S_GET_BARRIER_STATE_M0:
- case AMDGPU::S_GET_BARRIER_STATE_IMM:
- Events.insert(HWEvent::SMEM_ACCESS);
- break;
- }
- }
- return Events;
-}
-
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
- HWEventSet InstEvents = getEventsFor(Inst);
+ HWEventSet InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
for (HWEvent E : AMDGPU::hw_events()) {
if (InstEvents.contains(E))
ScoreBrackets->updateByEvent(E, Inst);
>From 9f43acacc07835a976c57336626686b192cc016e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 10 Jun 2026 11:32:45 +0200
Subject: [PATCH 2/5] Comment
---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp | 25 ++++++++++-------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index 937c8d05a5d1c..176473a3f9a02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -28,23 +28,23 @@ void HWEventSet::dump() const {
dbgs() << "\n";
}
-static std::optional<HWEvent>
-getExpertSchedulingEventType(const MachineInstr &Inst, const SIInstrInfo &TII) {
+static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
+ const SIInstrInfo &TII) {
if (TII.isVALU(Inst)) {
// Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
// out-of-order with respect to each other, so each of these classes
// has its own event.
if (TII.isXDL(Inst))
- return HWEvent::VGPR_XDL_WRITE;
+ return {HWEvent::VGPR_XDL_WRITE};
if (TII.isTRANS(Inst))
- return HWEvent::VGPR_TRANS_WRITE;
+ return {HWEvent::VGPR_TRANS_WRITE};
if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
- return HWEvent::VGPR_DPMACC_WRITE;
+ return {HWEvent::VGPR_DPMACC_WRITE};
- return HWEvent::VGPR_CSMACC_WRITE;
+ return {HWEvent::VGPR_CSMACC_WRITE};
}
// FLAT and LDS instructions may read their VGPR sources out-of-order
@@ -52,16 +52,15 @@ getExpertSchedulingEventType(const MachineInstr &Inst, const SIInstrInfo &TII) {
// each of these also has a separate event.
if (TII.isFLAT(Inst))
- return HWEvent::VGPR_FLAT_READ;
+ return {HWEvent::VGPR_FLAT_READ};
if (TII.isDS(Inst))
- return HWEvent::VGPR_LDS_READ;
+ return {HWEvent::VGPR_LDS_READ};
if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
- return HWEvent::VGPR_VMEM_READ;
+ return {HWEvent::VGPR_VMEM_READ};
// Otherwise, no hazard.
-
return {};
}
@@ -116,10 +115,8 @@ HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
const SIInstrInfo &TII = *ST.getInstrInfo();
HWEventSet Events;
- if (IsExpertMode) {
- if (const auto ET = getExpertSchedulingEventType(Inst, TII))
- Events.insert(*ET);
- }
+ if (IsExpertMode)
+ Events |= getExpertSchedulingEventType(Inst, TII);
if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
if (TII.isAlwaysGDS(Inst.getOpcode()) ||
>From 4482096b1e266fb2f781012a49e61f8a3c5c5ce6 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 10 Jun 2026 11:41:22 +0200
Subject: [PATCH 3/5] Delete function I missed
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 40 ---------------------
1 file changed, 40 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 3bc4357471169..b480906aab79f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -509,9 +509,6 @@ class SIInsertWaitcnts {
#endif // NDEBUG
}
- std::optional<HWEvent>
- getExpertSchedulingEventType(const MachineInstr &Inst) const;
-
bool isAsync(const MachineInstr &MI) const {
if (!SIInstrInfo::isLDSDMA(MI))
return false;
@@ -2703,43 +2700,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
return Modified;
}
-std::optional<HWEvent>
-SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
- if (TII.isVALU(Inst)) {
- // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
- // out-of-order with respect to each other, so each of these classes
- // has its own event.
-
- if (TII.isXDL(Inst))
- return HWEvent::VGPR_XDL_WRITE;
-
- if (TII.isTRANS(Inst))
- return HWEvent::VGPR_TRANS_WRITE;
-
- if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
- return HWEvent::VGPR_DPMACC_WRITE;
-
- return HWEvent::VGPR_CSMACC_WRITE;
- }
-
- // FLAT and LDS instructions may read their VGPR sources out-of-order
- // with respect to each other and all other VMEM instructions, so
- // each of these also has a separate event.
-
- if (TII.isFLAT(Inst))
- return HWEvent::VGPR_FLAT_READ;
-
- if (TII.isDS(Inst))
- return HWEvent::VGPR_LDS_READ;
-
- if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
- return HWEvent::VGPR_VMEM_READ;
-
- // Otherwise, no hazard.
-
- return {};
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
(TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
>From 0cdf26db21afb0f9ef6f73e0cc6bcf700501a9fd Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 10 Jun 2026 13:22:05 +0200
Subject: [PATCH 4/5] Comments
---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp | 142 ++++++++++++----------
1 file changed, 79 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index 176473a3f9a02..caef87afbe740 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -110,81 +110,97 @@ static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
return HWEvent::VMEM_ACCESS;
}
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
- bool IsExpertMode) {
- const SIInstrInfo &TII = *ST.getInstrInfo();
-
- HWEventSet Events;
- if (IsExpertMode)
- Events |= getExpertSchedulingEventType(Inst, TII);
-
+static HWEventSet getEventsForImpl(const MachineInstr &Inst,
+ const GCNSubtarget &ST,
+ const SIInstrInfo &TII) {
if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
if (TII.isAlwaysGDS(Inst.getOpcode()) ||
- TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
- Events.insert(HWEvent::GDS_ACCESS);
- Events.insert(HWEvent::GDS_GPR_LOCK);
- } else {
- Events.insert(HWEvent::LDS_ACCESS);
- }
- } else if (TII.isFLAT(Inst)) {
- if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
- Events.insert(getVmemHWEvent(Inst, ST, TII));
- } else {
- assert(Inst.mayLoadOrStore());
- if (TII.mayAccessVMEMThroughFlat(Inst)) {
- if (ST.hasWaitXcnt())
- Events.insert(HWEvent::VMEM_GROUP);
- Events.insert(getVmemHWEvent(Inst, ST, TII));
- }
- if (TII.mayAccessLDSThroughFlat(Inst))
- Events.insert(HWEvent::LDS_ACCESS);
+ TII.hasModifiersSet(Inst, AMDGPU::OpName::gds))
+ return {HWEvent::GDS_ACCESS, HWEvent::GDS_GPR_LOCK};
+
+ return {HWEvent::LDS_ACCESS};
+ }
+
+ if (TII.isFLAT(Inst)) {
+ if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode()))
+ return {getVmemHWEvent(Inst, ST, TII)};
+
+ assert(Inst.mayLoadOrStore());
+ HWEventSet S;
+ if (TII.mayAccessVMEMThroughFlat(Inst)) {
+ if (ST.hasWaitXcnt())
+ S.insert(HWEvent::VMEM_GROUP);
+ S.insert(getVmemHWEvent(Inst, ST, TII));
}
- } else if (SIInstrInfo::isVMEM(Inst) &&
- (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) ||
- Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
+
+ if (TII.mayAccessLDSThroughFlat(Inst))
+ S.insert(HWEvent::LDS_ACCESS);
+ return S;
+ }
+
+ if (SIInstrInfo::isVMEM(Inst) &&
+ (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) ||
+ Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
// BUFFER_WBL2 is included here because unlike invalidates, has to be
// followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
// completed.
+ HWEventSet S = {getVmemHWEvent(Inst, ST, TII)};
if (ST.hasWaitXcnt())
- Events.insert(HWEvent::VMEM_GROUP);
- Events.insert(getVmemHWEvent(Inst, ST, TII));
+ S.insert(HWEvent::VMEM_GROUP);
if (ST.vmemWriteNeedsExpWaitcnt() &&
- (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
- Events.insert(HWEvent::VMW_GPR_LOCK);
- }
- } else if (TII.isSMRD(Inst)) {
+ (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst)))
+ S.insert(HWEvent::VMW_GPR_LOCK);
+
+ return S;
+ }
+
+ if (TII.isSMRD(Inst)) {
if (ST.hasWaitXcnt())
- Events.insert(HWEvent::SMEM_GROUP);
- Events.insert(HWEvent::SMEM_ACCESS);
- } else if (SIInstrInfo::isLDSDIR(Inst)) {
- Events.insert(HWEvent::EXP_LDS_ACCESS);
- } else if (SIInstrInfo::isEXP(Inst)) {
+ return {HWEvent::SMEM_GROUP, HWEvent::SMEM_ACCESS};
+ return {HWEvent::SMEM_ACCESS};
+ }
+
+ if (SIInstrInfo::isLDSDIR(Inst)) {
+ return {HWEvent::EXP_LDS_ACCESS};
+ }
+
+ if (SIInstrInfo::isEXP(Inst)) {
unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
- Events.insert(HWEvent::EXP_PARAM_ACCESS);
- else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
- Events.insert(HWEvent::EXP_POS_ACCESS);
- else
- Events.insert(HWEvent::EXP_GPR_LOCK);
- } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
- Events.insert(HWEvent::SCC_WRITE);
- } else {
- switch (Inst.getOpcode()) {
- case AMDGPU::S_SENDMSG:
- case AMDGPU::S_SENDMSG_RTN_B32:
- case AMDGPU::S_SENDMSG_RTN_B64:
- case AMDGPU::S_SENDMSGHALT:
- Events.insert(HWEvent::SQ_MESSAGE);
- break;
- case AMDGPU::S_MEMTIME:
- case AMDGPU::S_MEMREALTIME:
- case AMDGPU::S_GET_BARRIER_STATE_M0:
- case AMDGPU::S_GET_BARRIER_STATE_IMM:
- Events.insert(HWEvent::SMEM_ACCESS);
- break;
- }
+ return {HWEvent::EXP_PARAM_ACCESS};
+ if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
+ return {HWEvent::EXP_POS_ACCESS};
+ return {HWEvent::EXP_GPR_LOCK};
+ }
+
+ if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
+ return {HWEvent::SCC_WRITE};
+ }
+
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSG_RTN_B32:
+ case AMDGPU::S_SENDMSG_RTN_B64:
+ case AMDGPU::S_SENDMSGHALT:
+ return {HWEvent::SQ_MESSAGE};
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ case AMDGPU::S_GET_BARRIER_STATE_M0:
+ case AMDGPU::S_GET_BARRIER_STATE_IMM:
+ return {HWEvent::SMEM_ACCESS};
}
- return Events;
+
+ return {};
+}
+
+HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode) {
+ const SIInstrInfo &TII = *ST.getInstrInfo();
+
+ if (IsExpertMode)
+ return getEventsForImpl(Inst, ST, TII) |
+ getExpertSchedulingEventType(Inst, TII);
+ return getEventsForImpl(Inst, ST, TII);
}
} // namespace AMDGPU
} // namespace llvm
>From 8d79c5c526e5acc9862d5f93801253b8d224ade5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 11 Jun 2026 13:49:55 +0200
Subject: [PATCH 5/5] Make HWEventSet ctor non-explicit
---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp | 34 +++++++++++------------
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h | 2 +-
2 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index caef87afbe740..82be5059e7443 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -36,15 +36,15 @@ static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
// has its own event.
if (TII.isXDL(Inst))
- return {HWEvent::VGPR_XDL_WRITE};
+ return HWEvent::VGPR_XDL_WRITE;
if (TII.isTRANS(Inst))
- return {HWEvent::VGPR_TRANS_WRITE};
+ return HWEvent::VGPR_TRANS_WRITE;
if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
- return {HWEvent::VGPR_DPMACC_WRITE};
+ return HWEvent::VGPR_DPMACC_WRITE;
- return {HWEvent::VGPR_CSMACC_WRITE};
+ return HWEvent::VGPR_CSMACC_WRITE;
}
// FLAT and LDS instructions may read their VGPR sources out-of-order
@@ -52,13 +52,13 @@ static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
// each of these also has a separate event.
if (TII.isFLAT(Inst))
- return {HWEvent::VGPR_FLAT_READ};
+ return HWEvent::VGPR_FLAT_READ;
if (TII.isDS(Inst))
- return {HWEvent::VGPR_LDS_READ};
+ return HWEvent::VGPR_LDS_READ;
if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
- return {HWEvent::VGPR_VMEM_READ};
+ return HWEvent::VGPR_VMEM_READ;
// Otherwise, no hazard.
return {};
@@ -118,12 +118,12 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
TII.hasModifiersSet(Inst, AMDGPU::OpName::gds))
return {HWEvent::GDS_ACCESS, HWEvent::GDS_GPR_LOCK};
- return {HWEvent::LDS_ACCESS};
+ return HWEvent::LDS_ACCESS;
}
if (TII.isFLAT(Inst)) {
if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode()))
- return {getVmemHWEvent(Inst, ST, TII)};
+ return getVmemHWEvent(Inst, ST, TII);
assert(Inst.mayLoadOrStore());
HWEventSet S;
@@ -157,24 +157,24 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
if (TII.isSMRD(Inst)) {
if (ST.hasWaitXcnt())
return {HWEvent::SMEM_GROUP, HWEvent::SMEM_ACCESS};
- return {HWEvent::SMEM_ACCESS};
+ return HWEvent::SMEM_ACCESS;
}
if (SIInstrInfo::isLDSDIR(Inst)) {
- return {HWEvent::EXP_LDS_ACCESS};
+ return HWEvent::EXP_LDS_ACCESS;
}
if (SIInstrInfo::isEXP(Inst)) {
unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
- return {HWEvent::EXP_PARAM_ACCESS};
+ return HWEvent::EXP_PARAM_ACCESS;
if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
- return {HWEvent::EXP_POS_ACCESS};
- return {HWEvent::EXP_GPR_LOCK};
+ return HWEvent::EXP_POS_ACCESS;
+ return HWEvent::EXP_GPR_LOCK;
}
if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
- return {HWEvent::SCC_WRITE};
+ return HWEvent::SCC_WRITE;
}
switch (Inst.getOpcode()) {
@@ -182,12 +182,12 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
case AMDGPU::S_SENDMSG_RTN_B32:
case AMDGPU::S_SENDMSG_RTN_B64:
case AMDGPU::S_SENDMSGHALT:
- return {HWEvent::SQ_MESSAGE};
+ return HWEvent::SQ_MESSAGE;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
- return {HWEvent::SMEM_ACCESS};
+ return HWEvent::SMEM_ACCESS;
}
return {};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
index 2ed1a2a361d1b..5fad398fa56b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
@@ -61,7 +61,7 @@ class HWEventSet {
public:
HWEventSet() = default;
- explicit constexpr HWEventSet(HWEvent Event) {
+ constexpr HWEventSet(HWEvent Event) {
static_assert(static_cast<unsigned>(HWEvent::NUM_WAIT_EVENTS) <=
sizeof(Mask) * 8,
"Not enough bits in Mask for all the events");
More information about the llvm-branch-commits
mailing list