[llvm-branch-commits] [llvm] [NFC][AMDGPU][InsertWaitCnts] Move some simple functions into Utils (PR #202936)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 11 04:15:51 PDT 2026
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/202936
>From 419a3ea395f7eb13ce5564fdd7893d77469cea43 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 10 Jun 2026 12:09:49 +0200
Subject: [PATCH] [NFC][AMDGPU][InsertWaitCnts] Move some simple functions into
Utils
Move trivial helper functions into AMDGPUWaitcntUtils to declutter SIInsertWaitcnts a bit more.
HardwareLimits had to move to a different header (AMDGPUWaitcntUtils.h), but it is only used by SIInsertWaitcnts, so nothing else is affected.
---
llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp | 75 ++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h | 32 ++++++
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 107 ++++--------------
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 20 ----
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 20 ----
5 files changed, 128 insertions(+), 126 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
index df8d22fb5e3dd..75e757d0c8b2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUWaitcntUtils.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
namespace llvm::AMDGPU {
@@ -47,6 +48,53 @@ StringLiteral getInstCounterName(InstCounterType T) {
llvm_unreachable("Unhandled InstCounterType");
}
+HardwareLimits::HardwareLimits(const IsaVersion &IV) {
+ bool HasExtendedWaitCounts = IV.Major >= 12;
+ if (HasExtendedWaitCounts) {
+ LoadcntMax = getLoadcntBitMask(IV);
+ DscntMax = getDscntBitMask(IV);
+ } else {
+ LoadcntMax = getVmcntBitMask(IV);
+ DscntMax = getLgkmcntBitMask(IV);
+ }
+ ExpcntMax = getExpcntBitMask(IV);
+ StorecntMax = getStorecntBitMask(IV);
+ SamplecntMax = getSamplecntBitMask(IV);
+ BvhcntMax = getBvhcntBitMask(IV);
+ KmcntMax = getKmcntBitMask(IV);
+ XcntMax = getXcntBitMask(IV);
+ AsyncMax = getAsynccntBitMask(IV);
+ VaVdstMax = DepCtr::getVaVdstBitMask();
+ VmVsrcMax = DepCtr::getVmVsrcBitMask();
+}
+
+unsigned HardwareLimits::get(InstCounterType T) const {
+ switch (T) {
+ case AMDGPU::LOAD_CNT:
+ return LoadcntMax;
+ case AMDGPU::DS_CNT:
+ return DscntMax;
+ case AMDGPU::EXP_CNT:
+ return ExpcntMax;
+ case AMDGPU::STORE_CNT:
+ return StorecntMax;
+ case AMDGPU::SAMPLE_CNT:
+ return SamplecntMax;
+ case AMDGPU::BVH_CNT:
+ return BvhcntMax;
+ case AMDGPU::KM_CNT:
+ return KmcntMax;
+ case AMDGPU::X_CNT:
+ return XcntMax;
+ case AMDGPU::VA_VDST:
+ return VaVdstMax;
+ case AMDGPU::VM_VSRC:
+ return VmVsrcMax;
+ default:
+ return 0;
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Waitcnt::dump() const { dbgs() << *this << '\n'; }
#endif
@@ -89,4 +137,31 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version,
Decoded.get(DS_CNT));
}
+std::optional<AMDGPU::InstCounterType> counterTypeForInstr(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_WAIT_LOADCNT:
+ return AMDGPU::LOAD_CNT;
+ case AMDGPU::S_WAIT_EXPCNT:
+ return AMDGPU::EXP_CNT;
+ case AMDGPU::S_WAIT_STORECNT:
+ return AMDGPU::STORE_CNT;
+ case AMDGPU::S_WAIT_SAMPLECNT:
+ return AMDGPU::SAMPLE_CNT;
+ case AMDGPU::S_WAIT_BVHCNT:
+ return AMDGPU::BVH_CNT;
+ case AMDGPU::S_WAIT_DSCNT:
+ return AMDGPU::DS_CNT;
+ case AMDGPU::S_WAIT_KMCNT:
+ return AMDGPU::KM_CNT;
+ case AMDGPU::S_WAIT_XCNT:
+ return AMDGPU::X_CNT;
+ case AMDGPU::S_WAIT_ASYNCCNT:
+ return AMDGPU::ASYNC_CNT;
+ case AMDGPU::S_WAIT_TENSORCNT:
+ return AMDGPU::TENSOR_CNT;
+ default:
+ return {};
+ }
+}
+
} // namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
index 093d8a45d207b..0930a95288087 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
@@ -46,6 +46,28 @@ StringLiteral getInstCounterName(InstCounterType T);
iota_range<InstCounterType>
inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS);
+/// Represents the hardware counter limits for different wait count types.
+struct HardwareLimits {
+ unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
+ unsigned ExpcntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
+ unsigned AsyncMax; // gfx1250.
+ unsigned VaVdstMax; // gfx12+ expert mode only.
+ unsigned VmVsrcMax; // gfx12+ expert mode only.
+
+ HardwareLimits() = default;
+
+ /// Initializes hardware limits from ISA version.
+ HardwareLimits(const IsaVersion &IV);
+
+ unsigned get(InstCounterType T) const;
+};
+
} // namespace AMDGPU
template <> struct enum_iteration_traits<AMDGPU::InstCounterType> {
@@ -109,6 +131,12 @@ class Waitcnt {
return false;
}
+ void add(AMDGPU::InstCounterType T, unsigned Count) {
+ set(T, std::min(get(T), Count));
+ }
+
+ void clear(AMDGPU::InstCounterType T) { set(T, ~0u); }
+
bool hasWaitStoreCnt() const { return Cnt[STORE_CNT] != ~0u; }
bool hasWaitDepctr() const {
@@ -168,6 +196,10 @@ unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
/// \p Version.
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
+/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
+/// and if so, which counter it is waiting on.
+std::optional<AMDGPU::InstCounterType> counterTypeForInstr(unsigned Opcode);
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1bb6950d4f574..f13d401f70fa2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -74,34 +74,6 @@ static cl::opt<bool> ExpertSchedulingModeFlag(
cl::init(false), cl::Hidden);
namespace {
-// Get the maximum wait count value for a given counter type.
-static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
- AMDGPU::InstCounterType T) {
- switch (T) {
- case AMDGPU::LOAD_CNT:
- return Limits.LoadcntMax;
- case AMDGPU::DS_CNT:
- return Limits.DscntMax;
- case AMDGPU::EXP_CNT:
- return Limits.ExpcntMax;
- case AMDGPU::STORE_CNT:
- return Limits.StorecntMax;
- case AMDGPU::SAMPLE_CNT:
- return Limits.SamplecntMax;
- case AMDGPU::BVH_CNT:
- return Limits.BvhcntMax;
- case AMDGPU::KM_CNT:
- return Limits.KmcntMax;
- case AMDGPU::X_CNT:
- return Limits.XcntMax;
- case AMDGPU::VA_VDST:
- return Limits.VaVdstMax;
- case AMDGPU::VM_VSRC:
- return Limits.VmVsrcMax;
- default:
- return 0;
- }
-}
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
@@ -211,13 +183,6 @@ VmemType getVmemType(const MachineInstr &Inst) {
return VMEM_NOSAMPLER;
}
-void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
- Wait.set(T, std::min(Wait.get(T), Count));
-}
-
-void setNoWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) {
- Wait.set(T, ~0u);
-}
class WaitcntBrackets;
// This abstracts the logic for generating and updating S_WAIT* instructions
@@ -701,7 +666,7 @@ class WaitcntBrackets {
unsigned getPendingGDSWait() const {
return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
- getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
+ Context->getLimits().get(AMDGPU::DS_CNT) - 1);
}
void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
@@ -730,7 +695,7 @@ class WaitcntBrackets {
void setStateOnFunctionEntryOrReturn() {
setScoreUB(AMDGPU::STORE_CNT,
getScoreUB(AMDGPU::STORE_CNT) +
- getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
+ Context->getLimits().get(AMDGPU::STORE_CNT));
PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
}
@@ -787,10 +752,9 @@ class WaitcntBrackets {
return;
if (getScoreRange(AMDGPU::EXP_CNT) >
- getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
+ Context->getLimits().get(AMDGPU::EXP_CNT))
ScoreLBs[AMDGPU::EXP_CNT] =
- ScoreUBs[AMDGPU::EXP_CNT] -
- getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
+ ScoreUBs[AMDGPU::EXP_CNT] - Context->getLimits().get(AMDGPU::EXP_CNT);
}
void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
@@ -1423,18 +1387,18 @@ void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
- addWait(Wait, T, 0);
+ Wait.add(T, 0);
} else if (counterOutOfOrder(T)) {
// Counter can get decremented out-of-order when there
// are multiple types event in the bracket. Also emit an s_wait counter
// with a conservative value of 0 for the counter.
- addWait(Wait, T, 0);
+ Wait.add(T, 0);
} else {
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
- unsigned NeededWait = std::min(
- UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
- addWait(Wait, T, NeededWait);
+ unsigned NeededWait =
+ std::min(UB - ScoreToWait, Context->getLimits().get(T) - 1);
+ Wait.add(T, NeededWait);
}
}
}
@@ -1662,36 +1626,6 @@ static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
return true;
}
-/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
-/// and if so, which counter it is waiting on.
-static std::optional<AMDGPU::InstCounterType>
-counterTypeForInstr(unsigned Opcode) {
- switch (Opcode) {
- case AMDGPU::S_WAIT_LOADCNT:
- return AMDGPU::LOAD_CNT;
- case AMDGPU::S_WAIT_EXPCNT:
- return AMDGPU::EXP_CNT;
- case AMDGPU::S_WAIT_STORECNT:
- return AMDGPU::STORE_CNT;
- case AMDGPU::S_WAIT_SAMPLECNT:
- return AMDGPU::SAMPLE_CNT;
- case AMDGPU::S_WAIT_BVHCNT:
- return AMDGPU::BVH_CNT;
- case AMDGPU::S_WAIT_DSCNT:
- return AMDGPU::DS_CNT;
- case AMDGPU::S_WAIT_KMCNT:
- return AMDGPU::KM_CNT;
- case AMDGPU::S_WAIT_XCNT:
- return AMDGPU::X_CNT;
- case AMDGPU::S_WAIT_ASYNCCNT:
- return AMDGPU::ASYNC_CNT;
- case AMDGPU::S_WAIT_TENSORCNT:
- return AMDGPU::TENSOR_CNT;
- default:
- return {};
- }
-}
-
bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
if (Opcode == Waitcnt->getOpcode())
@@ -1880,7 +1814,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
continue;
unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
- getWaitCountMax(getLimits(), CT) - 1);
+ getLimits().get(CT) - 1);
EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
AMDGPU::Waitcnt W;
W.set(CT, Count);
@@ -1910,7 +1844,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
// Only expand if counter is not out-of-order
unsigned Outstanding =
std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
- getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
+ getLimits().get(AMDGPU::STORE_CNT) - 1);
EmitExpandedWaitcnt(
Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
@@ -2064,14 +1998,15 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
Wait = Wait.combined(OldWait);
} else {
- std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
+ std::optional<AMDGPU::InstCounterType> CT =
+ AMDGPU::counterTypeForInstr(Opcode);
assert(CT.has_value());
unsigned OldCnt =
TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
if (TrySimplify)
- addWait(Wait, CT.value(), OldCnt);
+ Wait.add(CT.value(), OldCnt);
else
- addWait(RequiredWait, CT.value(), OldCnt);
+ RequiredWait.add(CT.value(), OldCnt);
// Keep the first wait of its kind, erase the rest.
if (WaitInstrs[CT.value()] == nullptr) {
WaitInstrs[CT.value()] = &II;
@@ -2188,7 +2123,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
ScoreBrackets.applyWaitcnt(CT, NewCnt);
- setNoWait(Wait, CT);
+ Wait.clear(CT);
LLVM_DEBUG(It.isEnd()
? dbgs() << "applied pre-existing waitcnt\n"
@@ -2272,8 +2207,8 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
continue;
}
- unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
- getWaitCountMax(getLimits(), CT) - 1);
+ unsigned Outstanding =
+ std::min(ScoreBrackets.getOutstanding(CT), getLimits().get(CT) - 1);
EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
.addImm(Val);
@@ -2455,7 +2390,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// Wait for any pending GDS instruction to complete before any
// "Always GDS" instruction.
if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
- addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
+ Wait.add(AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
if (MI.isCall()) {
// The function is going to insert a wait on everything in its prolog.
@@ -2496,7 +2431,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
const Value *Ptr = Memop->getValue();
if (Memop->isStore()) {
if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
- addWait(Wait, SmemAccessCounter, 0);
+ Wait.add(SmemAccessCounter, 0);
if (PDT.dominates(MI.getParent(), It->second))
SLoadAddresses.erase(It);
}
@@ -2993,7 +2928,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
Opcode == AMDGPU::S_WAITCNT_lds_direct ||
Opcode == AMDGPU::WAIT_ASYNCMARK ||
- counterTypeForInstr(Opcode).has_value();
+ AMDGPU::counterTypeForInstr(Opcode).has_value();
}
void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 33df51e8a7e07..3cec436b61701 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1835,26 +1835,6 @@ unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
-HardwareLimits::HardwareLimits(const IsaVersion &IV) {
- bool HasExtendedWaitCounts = IV.Major >= 12;
- if (HasExtendedWaitCounts) {
- LoadcntMax = getLoadcntBitMask(IV);
- DscntMax = getDscntBitMask(IV);
- } else {
- LoadcntMax = getVmcntBitMask(IV);
- DscntMax = getLgkmcntBitMask(IV);
- }
- ExpcntMax = getExpcntBitMask(IV);
- StorecntMax = getStorecntBitMask(IV);
- SamplecntMax = getSamplecntBitMask(IV);
- BvhcntMax = getBvhcntBitMask(IV);
- KmcntMax = getKmcntBitMask(IV);
- XcntMax = getXcntBitMask(IV);
- AsyncMax = getAsynccntBitMask(IV);
- VaVdstMax = DepCtr::getVaVdstBitMask();
- VmVsrcMax = DepCtr::getVmVsrcBitMask();
-}
-
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
getVmcntBitWidthLo(Version.Major));
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1f7084c8d25ae..e669add7ea980 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1101,26 +1101,6 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
/// Checks if \p Val is inside \p MD, a !range-like metadata.
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
-/// Represents the hardware counter limits for different wait count types.
-struct HardwareLimits {
- unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
- unsigned ExpcntMax;
- unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
- unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
- unsigned SamplecntMax; // gfx12+ only.
- unsigned BvhcntMax; // gfx12+ only.
- unsigned KmcntMax; // gfx12+ only.
- unsigned XcntMax; // gfx1250.
- unsigned AsyncMax; // gfx1250.
- unsigned VaVdstMax; // gfx12+ expert mode only.
- unsigned VmVsrcMax; // gfx12+ expert mode only.
-
- HardwareLimits() = default;
-
- /// Initializes hardware limits from ISA version.
- HardwareLimits(const IsaVersion &IV);
-};
-
// The following methods are only meaningful on targets that support
// S_WAITCNT.
More information about the llvm-branch-commits
mailing list