[llvm] [AMDGPU] Introduce ASYNC_CNT on GFX1250 (PR #185810)
Sameer Sahasrabuddhe via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 23:16:48 PDT 2026
https://github.com/ssahasra created https://github.com/llvm/llvm-project/pull/185810
Async operations transfer data between global memory and LDS. Their progress is tracked by the ASYNC_CNT counter on GFX1250 and later architectures. This change introduces the representation of that counter in SIInsertWaitcnts. For now, the programmer must manually insert s_wait_asynccnt instructions. Later changes will add compiler assistance for generating the waits by including this counter in the asyncmark instructions.
Assisted-by: Claude Sonnet 4.5
From e67a7d0bf6683aeea763b30deb47fe6ec697dd5b Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Mon, 9 Mar 2026 12:00:28 +0530
Subject: [PATCH] [AMDGPU] Introduce ASYNC_CNT on GFX1250
Async operations transfer data between global memory and LDS. Their progress is
tracked by the ASYNC_CNT counter on GFX1250 and later architectures. This change
introduces the representation of that counter in SIInsertWaitcnts. For now, the
programmer must manually insert s_wait_asynccnt instructions. Later changes will
add compiler assistance for generating the waits by including this counter in
the asyncmark instructions.
Assisted-by: Claude Sonnet 4.5
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++++
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 31 ++++++++++++++-----
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 9 ++++++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++++++++++--
4 files changed, 54 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a0b6ff13e7d7a..4259bf4c1b0bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -972,6 +972,11 @@ defm Vscnt : AMDGPUSubtargetFeature<"vscnt",
/*GenPredicate=*/0
>;
+defm Asynccnt : AMDGPUSubtargetFeature<"asynccnt",
+ "Has separate asynccnt counter",
+ /*GenPredicate=*/0
+>;
+
defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst",
"Has s_get_waveid_in_workgroup instruction"
>;
@@ -2032,6 +2037,7 @@ def FeatureISAVersion12_50_Common : FeatureSet<
FeatureSupportsSRAMECC,
FeatureMaxHardClauseLength63,
FeatureWaitXcnt,
+ FeatureAsynccnt,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFlatBufferGlobalAtomicFaddF64Inst,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b07516c22cf29..a804ba35bade7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -160,7 +160,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
- DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
+ DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
+ DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
@@ -217,7 +218,7 @@ enum VmemType {
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
- AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
+ AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, AMDGPU::S_WAIT_ASYNCCNT};
static bool updateVMCntOnly(const MachineInstr &Inst) {
return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -405,6 +406,8 @@ class WaitcntGenerator {
// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
+ // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
+ // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
virtual ~WaitcntGenerator() = default;
@@ -459,6 +462,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
WaitEventSet({VMEM_BVH_READ_ACCESS}),
WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
+ WaitEventSet({ASYNC_ACCESS}),
WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
VGPR_XDL_WRITE}),
WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
@@ -1314,6 +1318,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT(" << SR << "):";
break;
+ case ASYNC_CNT:
+ OS << " ASYNC_CNT(" << SR << "):";
+ break;
case VA_VDST:
OS << " VA_VDST(" << SR << "): ";
break;
@@ -1418,6 +1425,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case X_CNT:
OS << " X_CNT: " << MarkedScore;
break;
+ case ASYNC_CNT:
+ OS << " ASYNC_CNT: " << MarkedScore;
+ break;
default:
OS << " UNKNOWN: " << MarkedScore;
break;
@@ -1442,6 +1452,7 @@ void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
simplifyXcnt(CheckWait, UpdateWait);
simplifyWaitcnt(UpdateWait, VA_VDST);
simplifyVmVsrc(CheckWait, UpdateWait);
+ simplifyWaitcnt(UpdateWait, ASYNC_CNT);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1977,7 +1988,8 @@ AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
- ~0u /* XCNT */, ExpertVal, ExpertVal);
+ ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
+ ExpertVal);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -2917,6 +2929,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->setPendingGDS();
}
+ } else if (SIInstrInfo::usesASYNC_CNT(Inst)) {
+ // Async instructions use flat encoding, so this needs to happen before the
+ // isFLAT() check below.
+ ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
} else if (TII->isFLAT(Inst)) {
if (Inst.mayLoadOrStore() && TII->mayAccessVMEMThroughFlat(Inst) &&
TII->mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst))
@@ -2927,7 +2943,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// pointers so that both VM and LGKM counters are flushed.
ScoreBrackets->setPendingFlat();
} else if (Inst.isCall()) {
- // Act as a wait on everything
+ // Act as a wait on everything, but AsyncCnt is never included in such
+ // blanket waits.
ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else if (TII->isVINTERP(Inst)) {
@@ -3265,12 +3282,9 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
OldWaitcntInstr = nullptr;
if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
- // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
- //
// Asyncmarks record the current wait state and so should not allow
// waitcnts that occur after them to be merged into waitcnts that occur
// before.
- assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
@@ -3677,7 +3691,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
+ if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT ||
+ CT == ASYNC_CNT)
continue;
if (!ST->hasImageInsts() &&
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 488c150dd5c28..b04e5264feddc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -135,6 +135,10 @@ unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
}
+unsigned getAsynccntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
+ return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
+}
+
/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
return VersionMajor >= 12 ? 8 : 0;
@@ -1808,6 +1812,10 @@ unsigned getXcntBitMask(const IsaVersion &Version) {
return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
}
+unsigned getAsynccntBitMask(const IsaVersion &Version) {
+ return (1 << getAsynccntBitWidth(Version.Major, Version.Minor)) - 1;
+}
+
unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
@@ -1827,6 +1835,7 @@ HardwareLimits::HardwareLimits(const IsaVersion &IV) {
BvhcntMax = getBvhcntBitMask(IV);
KmcntMax = getKmcntBitMask(IV);
XcntMax = getXcntBitMask(IV);
+ AsyncMax = getAsynccntBitMask(IV);
VaVdstMax = DepCtr::getVaVdstBitMask();
VmVsrcMax = DepCtr::getVmVsrcBitMask();
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b3d20777ccfcf..9cec56090172b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1096,6 +1096,7 @@ enum InstCounterType {
BVH_CNT, // gfx12+ only.
KM_CNT, // gfx12+ only.
X_CNT, // gfx1250.
+ ASYNC_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
VM_VSRC, // gfx12+ expert mode only.
@@ -1130,6 +1131,7 @@ class Waitcnt {
unsigned BvhCnt = ~0u; // gfx12+ only.
unsigned KmCnt = ~0u; // gfx12+ only.
unsigned XCnt = ~0u; // gfx1250.
+ unsigned AsyncCnt = ~0u; // gfx1250.
unsigned VaVdst = ~0u; // gfx12+ expert scheduling mode only.
unsigned VmVsrc = ~0u; // gfx12+ expert scheduling mode only.
@@ -1152,6 +1154,8 @@ class Waitcnt {
return KmCnt;
case X_CNT:
return XCnt;
+ case ASYNC_CNT:
+ return AsyncCnt;
case VA_VDST:
return VaVdst;
case VM_VSRC:
@@ -1186,6 +1190,9 @@ class Waitcnt {
case X_CNT:
XCnt = Val;
break;
+ case ASYNC_CNT:
+ AsyncCnt = Val;
+ break;
case VA_VDST:
VaVdst = Val;
break;
@@ -1205,10 +1212,10 @@ class Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
- unsigned VaVdst, unsigned VmVsrc)
+ unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc)
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt),
- VaVdst(VaVdst), VmVsrc(VmVsrc) {}
+ AsyncCnt(AsyncCnt), VaVdst(VaVdst), VmVsrc(VmVsrc) {}
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
@@ -1230,7 +1237,8 @@ class Waitcnt {
std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt),
- std::min(VaVdst, Other.VaVdst), std::min(VmVsrc, Other.VmVsrc));
+ std::min(AsyncCnt, Other.AsyncCnt), std::min(VaVdst, Other.VaVdst),
+ std::min(VmVsrc, Other.VmVsrc));
}
friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait);
@@ -1246,6 +1254,7 @@ struct HardwareLimits {
unsigned BvhcntMax; // gfx12+ only.
unsigned KmcntMax; // gfx12+ only.
unsigned XcntMax; // gfx1250.
+ unsigned AsyncMax; // gfx1250.
unsigned VaVdstMax; // gfx12+ expert mode only.
unsigned VmVsrcMax; // gfx12+ expert mode only.
@@ -1349,6 +1358,10 @@ unsigned getSamplecntBitMask(const IsaVersion &Version);
/// Returns 0 for versions that do not support BVHcnt
unsigned getBvhcntBitMask(const IsaVersion &Version);
+/// \returns Asynccnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support Asynccnt
+unsigned getAsynccntBitMask(const IsaVersion &Version);
+
/// \returns Dscnt bit mask for given isa \p Version.
/// Returns 0 for versions that do not support DScnt
unsigned getDscntBitMask(const IsaVersion &Version);
More information about the llvm-commits
mailing list