[clang] [llvm] [AMDGPU][SIInsertWaitcnt] Implement Waitcnt Expansion for Profiling (PR #169345)
Pankaj Dwivedi via cfe-commits
cfe-commits at lists.llvm.org
Mon Jan 5 08:27:57 PST 2026
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/169345
>From 48c7b23636cf18645c1bc01b3f6f367130154e4a Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Nov 2025 22:57:32 +0530
Subject: [PATCH 01/10] [AMDGPU] Add -amdgpu-expand-waitcnt-profiling option
for PC-sampling profiling
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 194 +++++++++++++--
.../AMDGPU/expand-waitcnt-profiling.ll | 230 ++++++++++++++++++
2 files changed, 402 insertions(+), 22 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 70db7b4918515..b86a75e9b04ed 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,6 +63,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> ExpandWaitcntProfiling(
+ "amdgpu-expand-waitcnt-profiling",
+ cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false),
+ cl::Hidden);
+
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -297,6 +302,30 @@ class WaitcntGenerator {
// optimization.
bool isOptNone() const { return OptNone; }
+ // Get the maximum wait count value for a given counter type
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return AMDGPU::getLoadcntBitMask(IV);
+ case DS_CNT:
+ return AMDGPU::getDscntBitMask(IV);
+ case EXP_CNT:
+ return AMDGPU::getExpcntBitMask(IV);
+ case STORE_CNT:
+ return AMDGPU::getStorecntBitMask(IV);
+ case SAMPLE_CNT:
+ return AMDGPU::getSamplecntBitMask(IV);
+ case BVH_CNT:
+ return AMDGPU::getBvhcntBitMask(IV);
+ case KM_CNT:
+ return AMDGPU::getKmcntBitMask(IV);
+ case X_CNT:
+ return 0; // No hardware limit for XCNT; callers' `- 1` wraps to ~0u (no clamp)
+ default:
+ return 0;
+ }
+ }
+
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
@@ -318,9 +347,11 @@ class WaitcntGenerator {
// Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
+ // If ScoreBrackets is provided, it can be used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) = 0;
+ AMDGPU::Waitcnt Wait,
+ WaitcntBrackets *ScoreBrackets = nullptr) = 0;
// Returns an array of bit masks which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
@@ -356,7 +387,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
+ AMDGPU::Waitcnt Wait,
+ WaitcntBrackets *ScoreBrackets = nullptr) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -393,7 +425,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
+ AMDGPU::Waitcnt Wait,
+ WaitcntBrackets *ScoreBrackets = nullptr) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -1527,38 +1560,104 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
+ AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
+ // Helper to emit expanded waitcnt sequence for profiling.
+ // Emits waitcnts from (Outstanding-1) down to Target, or just Target if
+ // nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
+ auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+ auto EmitWaitcnt) {
+ if (Outstanding > Target) {
+ for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
+ EmitWaitcnt(i);
+ Modified = true;
+ }
+ } else {
+ EmitWaitcnt(Target);
+ Modified = true;
+ }
+ };
+
// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
+ // If profiling expansion is enabled and we have score brackets,
+ // emit an expanded sequence
+ if (ExpandWaitcntProfiling && ScoreBrackets) {
+ if (Wait.LoadCnt != ~0u) {
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(LOAD_CNT) -
+ ScoreBrackets->getScoreLB(LOAD_CNT),
+ getWaitCountMax(LOAD_CNT) - 1);
+ emitExpandedWaitcnt(Outstanding, Wait.LoadCnt, [&](unsigned Count) {
+ AMDGPU::Waitcnt W;
+ W.LoadCnt = Count;
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
+ .addImm(AMDGPU::encodeWaitcnt(IV, W));
+ });
+ }
+ if (Wait.DsCnt != ~0u) {
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(DS_CNT) -
+ ScoreBrackets->getScoreLB(DS_CNT),
+ getWaitCountMax(DS_CNT) - 1);
+ emitExpandedWaitcnt(Outstanding, Wait.DsCnt, [&](unsigned Count) {
+ AMDGPU::Waitcnt W;
+ W.DsCnt = Count;
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
+ .addImm(AMDGPU::encodeWaitcnt(IV, W));
+ });
+ }
+ if (Wait.ExpCnt != ~0u) {
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(EXP_CNT) -
+ ScoreBrackets->getScoreLB(EXP_CNT),
+ getWaitCountMax(EXP_CNT) - 1);
+ emitExpandedWaitcnt(Outstanding, Wait.ExpCnt, [&](unsigned Count) {
+ AMDGPU::Waitcnt W;
+ W.ExpCnt = Count;
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
+ .addImm(AMDGPU::encodeWaitcnt(IV, W));
+ });
+ }
+ } else {
+ // Normal behavior: emit single combined waitcnt
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
}
if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());
- [[maybe_unused]] auto SWaitInst =
+ if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) {
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
+ ScoreBrackets->getScoreLB(STORE_CNT),
+ getWaitCountMax(STORE_CNT) - 1);
+ emitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.StoreCnt);
- Modified = true;
+ .addImm(Count);
+ });
+ } else {
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.StoreCnt);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
}
return Modified;
@@ -1790,13 +1889,47 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
+ AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(!isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
+ // Helper to emit expanded waitcnt sequence for profiling.
+ auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+ auto EmitWaitcnt) {
+ if (Outstanding > Target) {
+ for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
+ EmitWaitcnt(i);
+ Modified = true;
+ }
+ } else {
+ EmitWaitcnt(Target);
+ Modified = true;
+ }
+ };
+
+ // For GFX12+, we use separate wait instructions, which makes expansion
+ // simpler
+ if (ExpandWaitcntProfiling && ScoreBrackets) {
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ unsigned Count = getWait(Wait, CT);
+ if (Count == ~0u)
+ continue;
+
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
+ ScoreBrackets->getScoreLB(CT),
+ getWaitCountMax(CT) - 1);
+ emitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
+ BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Val);
+ });
+ }
+ return Modified;
+ }
+
+ // Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
@@ -2162,9 +2295,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
- // Any counts that could have been applied to any existing waitcnt
- // instructions will have been done so, now deal with any remaining.
- ScoreBrackets.applyWaitcnt(Wait);
+ AMDGPU::Waitcnt WaitForScore = Wait;
// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -2181,9 +2312,28 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- if (WCG->createNewWaitcnt(Block, It, Wait))
+ // XCnt may be already consumed by a load wait.
+ // When we wait for KmCnt==0, all SMEM operations (including address
+ // translations) are complete, so XCNT wait is redundant. When we wait for
+ // LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address
+ // translations are complete (since XCnt follows LoadCnt for loads). When the
+ // current instruction is a VMEM access, translations are in-order.
+ if (Wait.XCnt != ~0u) {
+ if (Wait.KmCnt == 0)
+ Wait.XCnt = ~0u;
+ else if (Wait.LoadCnt == 0 && Wait.XCnt == 0)
+ Wait.XCnt = ~0u;
+ else if (isVmemAccess(*It))
+ Wait.XCnt = ~0u;
+ }
+
+ if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
Modified = true;
+ // Counts that could be folded into existing waitcnt instructions already have
+ // been; now apply the remaining waits to the score brackets.
+ ScoreBrackets.applyWaitcnt(WaitForScore);
+
return Modified;
}
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
new file mode 100644
index 0000000000000..3daf3142f2a96
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
+
+; This test demonstrates the waitcnt expansion feature for PC-sampling profiling.
+; The expansion transforms a single waitcnt instruction into a sequence of waitcnts
+; with decreasing counter values to help identify which specific memory operation
+; is causing a bottleneck.
+;
+; The kernels below keep multiple memory operations in flight before each waitcnt
+; so that ScoreBrackets tracks a non-zero number of outstanding events. When
+; -amdgpu-expand-waitcnt-profiling is enabled, each combined wait is expanded
+; into a descending sequence (e.g. outstanding=3 emits lgkmcnt(2), (1), (0))
+; which lets PC-sampling attribute long-latency stalls to the specific operation.
+
+define amdgpu_kernel void @case1_single_counter_lgkmcnt(
+; EXPAND-LABEL: case1_single_counter_lgkmcnt:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(2)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_add_i32 s0, s0, s1
+; EXPAND-NEXT: s_add_i32 s0, s0, s2
+; EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case1_single_counter_lgkmcnt:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; NOEXPAND-NEXT: s_add_i32 s0, s0, s2
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(4) %ptr_a,
+ ptr addrspace(4) %ptr_b,
+ ptr addrspace(4) %ptr_c,
+ ptr addrspace(1) %out) {
+ ; Three scalar loads - increment lgkmcnt
+ %val_a = load i32, ptr addrspace(4) %ptr_a, align 4
+ %val_b = load i32, ptr addrspace(4) %ptr_b, align 4
+ %val_c = load i32, ptr addrspace(4) %ptr_c, align 4
+
+ ; Use all three values
+ %sum1 = add i32 %val_a, %val_b
+ %sum2 = add i32 %sum1, %val_c
+
+ store i32 %sum2, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @case2_independent_counters(
+; EXPAND-LABEL: case2_independent_counters:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_add_i32 s0, s4, s5
+; EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case2_independent_counters:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_add_i32 s0, s4, s5
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(1) %global_ptr,
+ ptr addrspace(4) %scalar_ptr,
+ ptr addrspace(1) %out) {
+ ; Global memory load - increments vmcnt
+ %global_val = load i32, ptr addrspace(1) %global_ptr, align 4
+
+ ; Scalar memory load - increments lgkmcnt
+ %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4
+
+ ; Use both values - compiler must wait for both counters
+ %result = add i32 %global_val, %scalar_val
+
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @case3_overlapping_counters(
+; EXPAND-LABEL: case3_overlapping_counters:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: v_mov_b32_e32 v1, 1
+; EXPAND-NEXT: v_mov_b32_e32 v2, 2
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: s_add_u32 s2, s2, s6
+; EXPAND-NEXT: s_addc_u32 s3, s3, s7
+; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case3_overlapping_counters:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
+; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: s_add_u32 s2, s2, s6
+; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7
+; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(1) %buf,
+ ptr addrspace(1) %data,
+ i64 %offset) {
+ ; Issue 12 stores to buffer - each increments vmcnt
+ %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0
+ store volatile i32 1, ptr addrspace(1) %ptr0, align 4
+ %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1
+ store volatile i32 2, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2
+ store volatile i32 1, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3
+ store volatile i32 2, ptr addrspace(1) %ptr3, align 4
+ %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4
+ store volatile i32 1, ptr addrspace(1) %ptr4, align 4
+ %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5
+ store volatile i32 2, ptr addrspace(1) %ptr5, align 4
+ %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6
+ store volatile i32 1, ptr addrspace(1) %ptr6, align 4
+ %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7
+ store volatile i32 2, ptr addrspace(1) %ptr7, align 4
+ %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8
+ store volatile i32 1, ptr addrspace(1) %ptr8, align 4
+ %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9
+ store volatile i32 2, ptr addrspace(1) %ptr9, align 4
+ %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10
+ store volatile i32 1, ptr addrspace(1) %ptr10, align 4
+ %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11
+ store volatile i32 2, ptr addrspace(1) %ptr11, align 4
+
+ ; Load from potentially aliasing address - also increments vmcnt
+ %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset
+ %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4
+
+ ; Store the loaded value
+ %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12
+ store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4
+
+ ret void
+}
>From a28ab4e6c7b0079a0b7d38aab7ec4cc1a3d926af Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 26 Nov 2025 13:09:43 +0530
Subject: [PATCH 02/10] Add RUN lines for different GPU generations and counter types
---
.../AMDGPU/expand-waitcnt-profiling.ll | 790 +++++++++++++-----
1 file changed, 577 insertions(+), 213 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index 3daf3142f2a96..6a0b053d315de 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -1,230 +1,594 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
-
-; This test demonstrates the waitcnt expansion feature for PC-sampling profiling.
-; The expansion transforms a single waitcnt instruction into a sequence of waitcnts
-; with decreasing counter values to help identify which specific memory operation
-; is causing a bottleneck.
-;
-; The kernels below keep multiple memory operations in flight before each waitcnt
-; so that ScoreBrackets tracks a non-zero number of outstanding events. When
-; -amdgpu-expand-waitcnt-profiling is enabled, each combined wait is expanded
-; into a descending sequence (e.g. outstanding=3 emits lgkmcnt(2), (1), (0))
-; which lets PC-sampling attribute long-latency stalls to the specific operation.
-
-define amdgpu_kernel void @case1_single_counter_lgkmcnt(
-; EXPAND-LABEL: case1_single_counter_lgkmcnt:
-; EXPAND: ; %bb.0:
-; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
-; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
-; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; EXPAND-NEXT: s_waitcnt lgkmcnt(2)
-; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; EXPAND-NEXT: s_add_i32 s0, s0, s1
-; EXPAND-NEXT: s_add_i32 s0, s0, s2
-; EXPAND-NEXT: v_mov_b32_e32 v1, s0
-; EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
-; EXPAND-NEXT: s_endpgm
-;
-; NOEXPAND-LABEL: case1_single_counter_lgkmcnt:
-; NOEXPAND: ; %bb.0:
-; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
-; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
-; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; NOEXPAND-NEXT: s_add_i32 s0, s0, s1
-; NOEXPAND-NEXT: s_add_i32 s0, s0, s2
-; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
-; NOEXPAND-NEXT: s_endpgm
- ptr addrspace(4) %ptr_a,
- ptr addrspace(4) %ptr_b,
- ptr addrspace(4) %ptr_c,
- ptr addrspace(1) %out) {
- ; Three scalar loads - increment lgkmcnt
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX9-EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9-NOEXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX10-EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10-NOEXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX11-EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11-NOEXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX12-EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12-NOEXPAND %s
+
+; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding
+; operations, instead of emitting a single waitcnt(target), we emit:
+; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target)
+;
+; This allows PC-sampling profilers to identify which specific operation
+; is causing a stall by observing where the program counter is stuck.
+
+define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) {
+; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX9-EXPAND: ; %bb.0:
+; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX9-NOEXPAND: ; %bb.0:
+; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX10-EXPAND: ; %bb.0:
+; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(2)
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX10-NOEXPAND: ; %bb.0:
+; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX11-EXPAND: ; %bb.0:
+; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(2)
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX11-NOEXPAND: ; %bb.0:
+; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX12-EXPAND: ; %bb.0:
+; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x2
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX12-NOEXPAND: ; %bb.0:
+; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
+; GFX12-NOEXPAND-NEXT: s_endpgm
+
%val_a = load i32, ptr addrspace(4) %ptr_a, align 4
%val_b = load i32, ptr addrspace(4) %ptr_b, align 4
%val_c = load i32, ptr addrspace(4) %ptr_c, align 4
-
- ; Use all three values
%sum1 = add i32 %val_a, %val_b
%sum2 = add i32 %sum1, %val_c
-
store i32 %sum2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @case2_independent_counters(
-; EXPAND-LABEL: case2_independent_counters:
-; EXPAND: ; %bb.0:
-; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
-; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; EXPAND-NEXT: s_add_i32 s0, s4, s5
-; EXPAND-NEXT: v_mov_b32_e32 v1, s0
-; EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
-; EXPAND-NEXT: s_endpgm
-;
-; NOEXPAND-LABEL: case2_independent_counters:
-; NOEXPAND: ; %bb.0:
-; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
-; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; NOEXPAND-NEXT: s_add_i32 s0, s4, s5
-; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
-; NOEXPAND-NEXT: s_endpgm
- ptr addrspace(1) %global_ptr,
- ptr addrspace(4) %scalar_ptr,
- ptr addrspace(1) %out) {
- ; Global memory load - increments vmcnt
- %global_val = load i32, ptr addrspace(1) %global_ptr, align 4
-
- ; Scalar memory load - increments lgkmcnt
- %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4
-
- ; Use both values - compiler must wait for both counters
- %result = add i32 %global_val, %scalar_val
+define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) {
+; GFX9-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX9-EXPAND: ; %bb.0:
+; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
+; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
+; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2)
+; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1)
+; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX9-NOEXPAND: ; %bb.0:
+; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
+; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
+; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX10-EXPAND: ; %bb.0:
+; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: s_clause 0x2
+; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
+; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
+; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2)
+; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1)
+; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX10-NOEXPAND: ; %bb.0:
+; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: s_clause 0x2
+; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
+; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
+; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX11-EXPAND: ; %bb.0:
+; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_clause 0x2
+; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
+; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
+; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2)
+; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1)
+; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX11-NOEXPAND: ; %bb.0:
+; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_clause 0x2
+; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
+; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
+; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX12-EXPAND: ; %bb.0:
+; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_clause 0x2
+; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
+; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
+; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2
+; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1
+; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0
+; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX12-NOEXPAND: ; %bb.0:
+; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_clause 0x2
+; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
+; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
+; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
+; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-NOEXPAND-NEXT: s_endpgm
+
+ ; Use thread ID to create thread-varying addresses -> forces vector loads
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid64 = zext i32 %tid to i64
- store i32 %result, ptr addrspace(1) %out, align 4
+ ; Three separate global loads with thread-varying addresses
+ ; Non-volatile loads allow multiple operations to be in-flight
+ %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
+ %val0 = load i32, ptr addrspace(1) %ptr0, align 4
+
+ %offset1 = add i64 %tid64, 64
+ %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
+ %val1 = load i32, ptr addrspace(1) %ptr1, align 4
+
+ %offset2 = add i64 %tid64, 128
+ %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
+ %val2 = load i32, ptr addrspace(1) %ptr2, align 4
+
+ %sum1 = add i32 %val0, %val1
+ %sum2 = add i32 %sum1, %val2
+
+ %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64
+ store i32 %sum2, ptr addrspace(1) %out_ptr, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) {
+; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX9-EXPAND: ; %bb.0:
+; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX9-NOEXPAND: ; %bb.0:
+; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX10-EXPAND: ; %bb.0:
+; GFX10-EXPAND-NEXT: s_clause 0x1
+; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX10-NOEXPAND: ; %bb.0:
+; GFX10-NOEXPAND-NEXT: s_clause 0x1
+; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX11-EXPAND: ; %bb.0:
+; GFX11-EXPAND-NEXT: s_clause 0x1
+; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX11-NOEXPAND: ; %bb.0:
+; GFX11-NOEXPAND-NEXT: s_clause 0x1
+; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX12-EXPAND: ; %bb.0:
+; GFX12-EXPAND-NEXT: s_clause 0x1
+; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
+; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX12-NOEXPAND: ; %bb.0:
+; GFX12-NOEXPAND-NEXT: s_clause 0x1
+; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
+; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
+; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NOEXPAND-NEXT: s_endpgm
+
+ %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0
+ %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
+ %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2
+ %val0 = load i32, ptr addrspace(3) %ptr0, align 4
+ %val1 = load i32, ptr addrspace(3) %ptr1, align 4
+ %val2 = load i32, ptr addrspace(3) %ptr2, align 4
+ %sum1 = add i32 %val0, %val1
+ %sum2 = add i32 %sum1, %val2
+ store i32 %sum2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @case3_overlapping_counters(
-; EXPAND-LABEL: case3_overlapping_counters:
-; EXPAND: ; %bb.0:
-; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; EXPAND-NEXT: v_mov_b32_e32 v1, 1
-; EXPAND-NEXT: v_mov_b32_e32 v2, 2
-; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: s_add_u32 s2, s2, s6
-; EXPAND-NEXT: s_addc_u32 s3, s3, s7
-; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
-; EXPAND-NEXT: s_waitcnt vmcnt(0)
-; EXPAND-NEXT: s_endpgm
-;
-; NOEXPAND-LABEL: case3_overlapping_counters:
-; NOEXPAND: ; %bb.0:
-; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
-; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
-; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: s_add_u32 s2, s2, s6
-; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7
-; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
-; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; NOEXPAND-NEXT: s_endpgm
- ptr addrspace(1) %buf,
- ptr addrspace(1) %data,
- i64 %offset) {
- ; Issue 12 stores to buffer - each increments vmcnt
- %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0
- store volatile i32 1, ptr addrspace(1) %ptr0, align 4
- %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1
- store volatile i32 2, ptr addrspace(1) %ptr1, align 4
- %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2
- store volatile i32 1, ptr addrspace(1) %ptr2, align 4
- %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3
- store volatile i32 2, ptr addrspace(1) %ptr3, align 4
- %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4
- store volatile i32 1, ptr addrspace(1) %ptr4, align 4
- %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5
- store volatile i32 2, ptr addrspace(1) %ptr5, align 4
- %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6
- store volatile i32 1, ptr addrspace(1) %ptr6, align 4
- %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7
- store volatile i32 2, ptr addrspace(1) %ptr7, align 4
- %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8
- store volatile i32 1, ptr addrspace(1) %ptr8, align 4
- %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9
- store volatile i32 2, ptr addrspace(1) %ptr9, align 4
- %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10
- store volatile i32 1, ptr addrspace(1) %ptr10, align 4
- %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11
- store volatile i32 2, ptr addrspace(1) %ptr11, align 4
-
- ; Load from potentially aliasing address - also increments vmcnt
- %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset
- %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4
-
- ; Store the loaded value
- %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12
- store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4
+define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) {
+; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX9-EXPAND: ; %bb.0:
+; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX9-NOEXPAND: ; %bb.0:
+; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX10-EXPAND: ; %bb.0:
+; GFX10-EXPAND-NEXT: s_clause 0x1
+; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX10-NOEXPAND: ; %bb.0:
+; GFX10-NOEXPAND-NEXT: s_clause 0x1
+; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX11-EXPAND: ; %bb.0:
+; GFX11-EXPAND-NEXT: s_clause 0x1
+; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX11-NOEXPAND: ; %bb.0:
+; GFX11-NOEXPAND-NEXT: s_clause 0x1
+; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX12-EXPAND: ; %bb.0:
+; GFX12-EXPAND-NEXT: s_clause 0x1
+; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX12-NOEXPAND: ; %bb.0:
+; GFX12-NOEXPAND-NEXT: s_clause 0x1
+; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX12-NOEXPAND-NEXT: s_endpgm
+
+ %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4
+ %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4
+ %result = add i32 %scalar_val1, %scalar_val2
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
>From 7e993fb33983b5a1912a840940f6d18d3ab14b06 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 26 Nov 2025 18:49:41 +0530
Subject: [PATCH 03/10] Address reviewer feedback: fix getWaitCountMax and
reduce code duplication
- Fix getWaitCountMax() to use correct bitmasks based on architecture:
  - Pre-GFX12: Use getVmcntBitMask/getLgkmcntBitMask for LOAD_CNT/DS_CNT
  - GFX12+: Use getLoadcntBitMask/getDscntBitMask for LOAD_CNT/DS_CNT
- Refactor repetitive if-blocks for LOAD_CNT, DS_CNT, EXP_CNT into
  a single loop using getCounterRef helper function
- Fix X_CNT to return proper getXcntBitMask(IV) instead of 0
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 50 ++++++++-------------
1 file changed, 18 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b86a75e9b04ed..5aba3e2833b9c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -302,13 +302,17 @@ class WaitcntGenerator {
// optimization.
bool isOptNone() const { return OptNone; }
- // Get the maximum wait count value for a given counter type
+ // Get the maximum wait count value for a given counter type.
+ // For pre-GFX12, LOAD_CNT uses vmcnt and DS_CNT uses lgkmcnt.
+ // For GFX12+, LOAD_CNT uses loadcnt and DS_CNT uses dscnt.
unsigned getWaitCountMax(InstCounterType T) const {
switch (T) {
case LOAD_CNT:
- return AMDGPU::getLoadcntBitMask(IV);
+ return ST->hasExtendedWaitCounts() ? AMDGPU::getLoadcntBitMask(IV)
+ : AMDGPU::getVmcntBitMask(IV);
case DS_CNT:
- return AMDGPU::getDscntBitMask(IV);
+ return ST->hasExtendedWaitCounts() ? AMDGPU::getDscntBitMask(IV)
+ : AMDGPU::getLgkmcntBitMask(IV);
case EXP_CNT:
return AMDGPU::getExpcntBitMask(IV);
case STORE_CNT:
@@ -320,7 +324,7 @@ class WaitcntGenerator {
case KM_CNT:
return AMDGPU::getKmcntBitMask(IV);
case X_CNT:
- return 0; // No hardware limit for XCNT
+ return AMDGPU::getXcntBitMask(IV);
default:
return 0;
}
@@ -1589,35 +1593,17 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
// If profiling expansion is enabled and we have score brackets,
// emit an expanded sequence
if (ExpandWaitcntProfiling && ScoreBrackets) {
- if (Wait.LoadCnt != ~0u) {
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(LOAD_CNT) -
- ScoreBrackets->getScoreLB(LOAD_CNT),
- getWaitCountMax(LOAD_CNT) - 1);
- emitExpandedWaitcnt(Outstanding, Wait.LoadCnt, [&](unsigned Count) {
- AMDGPU::Waitcnt W;
- W.LoadCnt = Count;
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(IV, W));
- });
- }
- if (Wait.DsCnt != ~0u) {
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(DS_CNT) -
- ScoreBrackets->getScoreLB(DS_CNT),
- getWaitCountMax(DS_CNT) - 1);
- emitExpandedWaitcnt(Outstanding, Wait.DsCnt, [&](unsigned Count) {
- AMDGPU::Waitcnt W;
- W.DsCnt = Count;
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(IV, W));
- });
- }
- if (Wait.ExpCnt != ~0u) {
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(EXP_CNT) -
- ScoreBrackets->getScoreLB(EXP_CNT),
- getWaitCountMax(EXP_CNT) - 1);
- emitExpandedWaitcnt(Outstanding, Wait.ExpCnt, [&](unsigned Count) {
+ for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+ unsigned &WaitCnt = getCounterRef(Wait, CT);
+ if (WaitCnt == ~0u)
+ continue;
+
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
+ ScoreBrackets->getScoreLB(CT),
+ getWaitCountMax(CT) - 1);
+ emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
AMDGPU::Waitcnt W;
- W.ExpCnt = Count;
+ getCounterRef(W, CT) = Count;
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
.addImm(AMDGPU::encodeWaitcnt(IV, W));
});
>From 709640d569e7ea3c886dc98a3b51c04aaff4fd70 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 2 Dec 2025 13:20:55 +0530
Subject: [PATCH 04/10] skip expanding out-of-order events
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 54 ++++--
.../AMDGPU/expand-waitcnt-profiling.ll | 163 +++++++++++++++---
2 files changed, 185 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5aba3e2833b9c..bbf73dd4d748c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1593,20 +1593,40 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
// If profiling expansion is enabled and we have score brackets,
// emit an expanded sequence
if (ExpandWaitcntProfiling && ScoreBrackets) {
+ // Check if any of the counters to be waited on are out-of-order.
+ // If so, fall back to normal (non-expanded) behavior since expansion
+ // would provide misleading profiling information.
+ bool AnyOutOfOrder = false;
for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
unsigned &WaitCnt = getCounterRef(Wait, CT);
- if (WaitCnt == ~0u)
- continue;
+ if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
+ AnyOutOfOrder = true;
+ break;
+ }
+ }
+
+ if (AnyOutOfOrder) {
+ // Fall back to non-expanded wait
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ } else {
+ // All counters are in-order, safe to expand
+ for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+ unsigned &WaitCnt = getCounterRef(Wait, CT);
+ if (WaitCnt == ~0u)
+ continue;
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
- ScoreBrackets->getScoreLB(CT),
- getWaitCountMax(CT) - 1);
- emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
- AMDGPU::Waitcnt W;
- getCounterRef(W, CT) = Count;
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(IV, W));
- });
+ unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
+ ScoreBrackets->getScoreLB(CT),
+ getWaitCountMax(CT) - 1);
+ emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
+ AMDGPU::Waitcnt W;
+ getCounterRef(W, CT) = Count;
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
+ .addImm(AMDGPU::encodeWaitcnt(IV, W));
+ });
+ }
}
} else {
// Normal behavior: emit single combined waitcnt
@@ -1624,7 +1644,9 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());
- if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) {
+ if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
+ !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
+ // Only expand if counter is not out-of-order
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
ScoreBrackets->getScoreLB(STORE_CNT),
getWaitCountMax(STORE_CNT) - 1);
@@ -1904,6 +1926,14 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
if (Count == ~0u)
continue;
+ // Skip expansion for out-of-order counters - emit normal wait instead
+ if (ScoreBrackets->counterOutOfOrder(CT)) {
+ BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Count);
+ Modified = true;
+ continue;
+ }
+
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index 6a0b053d315de..ec30477e34ba2 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -24,8 +24,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt
; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2
@@ -56,8 +54,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt
; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2
@@ -87,8 +83,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(2)
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -119,8 +113,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x2
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -313,7 +305,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
@@ -346,7 +337,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX10-EXPAND-NEXT: s_clause 0x1
; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
@@ -381,7 +371,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX11-EXPAND-NEXT: s_clause 0x1
; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
@@ -416,7 +405,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX12-EXPAND-NEXT: s_clause 0x1
; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
@@ -464,11 +452,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_
; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
@@ -495,11 +481,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_
; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
@@ -526,11 +510,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_
; GFX11-EXPAND-NEXT: s_clause 0x1
; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -558,11 +540,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_
; GFX12-EXPAND-NEXT: s_clause 0x1
; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -592,3 +572,146 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
+
+; Test that expansion is NOT applied when counters are out-of-order (mixed event types).
+; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete
+; out-of-order relative to each other. When both are in-flight, we should NOT expand
+; because the expansion would be misleading.
+define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) {
+; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX9-EXPAND: ; %bb.0:
+; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
+; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX9-NOEXPAND: ; %bb.0:
+; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX10-EXPAND: ; %bb.0:
+; GFX10-EXPAND-NEXT: s_clause 0x1
+; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
+; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX10-NOEXPAND: ; %bb.0:
+; GFX10-NOEXPAND-NEXT: s_clause 0x1
+; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX11-EXPAND: ; %bb.0:
+; GFX11-EXPAND-NEXT: s_clause 0x1
+; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
+; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX11-NOEXPAND: ; %bb.0:
+; GFX11-NOEXPAND-NEXT: s_clause 0x1
+; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX12-EXPAND: ; %bb.0:
+; GFX12-EXPAND-NEXT: s_clause 0x1
+; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
+; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX12-NOEXPAND: ; %bb.0:
+; GFX12-NOEXPAND-NEXT: s_clause 0x1
+; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX12-NOEXPAND-NEXT: s_endpgm
+
+ %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4
+ %smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4
+ %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
+ %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4
+ %sum1 = add i32 %lds_val1, %lds_val2
+ %sum2 = add i32 %sum1, %smem_val
+ store i32 %sum2, ptr addrspace(1) %out, align 4
+ ret void
+}
>From 399166c09c9dfa699bef321e238e55d4501ba600 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 2 Dec 2025 18:32:41 +0530
Subject: [PATCH 05/10] fix: resolve issue after rebase
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 15 ---------------
1 file changed, 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index bbf73dd4d748c..d189733171bb1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2328,21 +2328,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- // XCnt may be already consumed by a load wait.
- // When we wait for KmCnt==0, all SMEM operations (including address
- // translations) are complete, so XCNT wait is redundant. When we wait for
- // LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address
- // translations are complete (since XCnt follows LoadCnt for loads). When the
- // current instruction is a VMEM access, translations are in-order.
- if (Wait.XCnt != ~0u) {
- if (Wait.KmCnt == 0)
- Wait.XCnt = ~0u;
- else if (Wait.LoadCnt == 0 && Wait.XCnt == 0)
- Wait.XCnt = ~0u;
- else if (isVmemAccess(*It))
- Wait.XCnt = ~0u;
- }
-
if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
Modified = true;
>From 019504b45ca33f1e470023010b1adc2e4147b3eb Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 3 Dec 2025 15:05:16 +0530
Subject: [PATCH 06/10] add more test
---
.../AMDGPU/expand-waitcnt-profiling.ll | 225 ++++++++++++++++++
1 file changed, 225 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index ec30477e34ba2..b15b58f6a58d8 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -715,3 +715,228 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
store i32 %sum2, ptr addrspace(1) %out, align 4
ret void
}
+
+define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) {
+; Test vector memory stores (vmcnt on GFX9, STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+)
+; GFX9-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX9-EXPAND: ; %bb.0: ; %entry
+; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1
+; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3
+; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512
+; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX9-NOEXPAND: ; %bb.0: ; %entry
+; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1
+; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3
+; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512
+; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX10-EXPAND: ; %bb.0: ; %entry
+; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v3, 3
+; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256
+; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512
+; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX10-NOEXPAND: ; %bb.0: ; %entry
+; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3
+; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256
+; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512
+; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX11-EXPAND: ; %bb.0: ; %entry
+; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT: s_clause 0x2
+; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
+; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
+; GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX11-NOEXPAND: ; %bb.0: ; %entry
+; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT: s_clause 0x2
+; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
+; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
+; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX12-EXPAND: ; %bb.0: ; %entry
+; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT: s_clause 0x2
+; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
+; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
+; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX12-NOEXPAND: ; %bb.0: ; %entry
+; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT: s_clause 0x2
+; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
+; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
+; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0
+; GFX12-NOEXPAND-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid64 = zext i32 %tid to i64
+
+ ; Issue multiple stores
+ %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
+ store i32 1, ptr addrspace(1) %ptr0, align 4
+
+ %offset1 = add i64 %tid64, 64
+ %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
+ store i32 2, ptr addrspace(1) %ptr1, align 4
+
+ %offset2 = add i64 %tid64, 128
+ %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
+ store i32 3, ptr addrspace(1) %ptr2, align 4
+
+ ; Memory fence forces wait for all stores
+ fence release
+ ret void
+}
+
+define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) {
+; Test export operations (EXP_CNT/expcnt)
+; GFX9-EXPAND-LABEL: test_expcnt_exports:
+; GFX9-EXPAND: ; %bb.0: ; %entry
+; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
+; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
+; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
+; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done
+; GFX9-EXPAND-NEXT: s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX9-NOEXPAND: ; %bb.0: ; %entry
+; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
+; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
+; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
+; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done
+; GFX9-NOEXPAND-NEXT: s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_expcnt_exports:
+; GFX10-EXPAND: ; %bb.0: ; %entry
+; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
+; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
+; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
+; GFX10-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done
+; GFX10-EXPAND-NEXT: s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX10-NOEXPAND: ; %bb.0: ; %entry
+; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
+; GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
+; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
+; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done
+; GFX10-NOEXPAND-NEXT: s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_expcnt_exports:
+; GFX11-EXPAND: ; %bb.0: ; %entry
+; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
+; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
+; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
+; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done
+; GFX11-EXPAND-NEXT: s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX11-NOEXPAND: ; %bb.0: ; %entry
+; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
+; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
+; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
+; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done
+; GFX11-NOEXPAND-NEXT: s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_expcnt_exports:
+; GFX12-EXPAND: ; %bb.0: ; %entry
+; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3
+; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0
+; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2
+; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done
+; GFX12-EXPAND-NEXT: s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX12-NOEXPAND: ; %bb.0: ; %entry
+; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3
+; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0
+; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2
+; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done
+; GFX12-NOEXPAND-NEXT: s_endpgm
+entry:
+ ; Multiple MRT exports
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false)
+ ; Final export with done bit
+ call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false)
+ ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
>From 591a10f3b3c8dc4691c9ae5a60e2e7bc4f72c943 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <pankajkumar.divedi at amd.com>
Date: Mon, 5 Jan 2026 15:21:24 +0530
Subject: [PATCH 07/10] Update SIInsertWaitcnts.cpp
Co-authored-by: Pierre van Houtryve <pierre.vanhoutryve at amd.com>
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index d189733171bb1..0fb3692ff00ac 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1574,7 +1574,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
// Helper to emit expanded waitcnt sequence for profiling.
// Emits waitcnts from (Outstanding-1) down to Target, or just Target if
// nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
- auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
auto EmitWaitcnt) {
if (Outstanding > Target) {
for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
>From 62d35ccc7a0d5520e17daf0454a774a54622a680 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <pankajkumar.divedi at amd.com>
Date: Mon, 5 Jan 2026 15:21:32 +0530
Subject: [PATCH 08/10] Update SIInsertWaitcnts.cpp
Co-authored-by: Pierre van Houtryve <pierre.vanhoutryve at amd.com>
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0fb3692ff00ac..9df3452753e4c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1905,7 +1905,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
const DebugLoc &DL = Block.findDebugLoc(It);
// Helper to emit expanded waitcnt sequence for profiling.
- auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
auto EmitWaitcnt) {
if (Outstanding > Target) {
for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
>From 7196fc6783868a5eaef3ac9dec46e8978fa65e87 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 5 Jan 2026 21:17:58 +0530
Subject: [PATCH 09/10] review: move hardwareLimit inside AMDGPUBaseInfo
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 77 +++++++------------
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 17 ++++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 18 +++++
3 files changed, 63 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9df3452753e4c..90e262a9b9269 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -104,17 +104,6 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
using RegInterval = std::pair<int, int>;
-struct HardwareLimits {
- unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
- unsigned ExpcntMax;
- unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
- unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
- unsigned SamplecntMax; // gfx12+ only.
- unsigned BvhcntMax; // gfx12+ only.
- unsigned KmcntMax; // gfx12+ only.
- unsigned XcntMax; // gfx1250.
-};
-
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
DECL(VMEM_ACCESS) /* vmem read & write */ \
DECL(VMEM_READ_ACCESS) /* vmem read */ \
@@ -289,42 +278,41 @@ class WaitcntGenerator {
AMDGPU::IsaVersion IV;
InstCounterType MaxCounter;
bool OptNone;
+ const AMDGPU::HardwareLimits *Limits = nullptr;
public:
WaitcntGenerator() = default;
- WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
+ WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
+ const AMDGPU::HardwareLimits *Limits)
: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
OptNone(MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
+ MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
+ Limits(Limits) {}
// Return true if the current function should be compiled with no
// optimization.
bool isOptNone() const { return OptNone; }
// Get the maximum wait count value for a given counter type.
- // For pre-GFX12, LOAD_CNT uses vmcnt and DS_CNT uses lgkmcnt.
- // For GFX12+, LOAD_CNT uses loadcnt and DS_CNT uses dscnt.
unsigned getWaitCountMax(InstCounterType T) const {
switch (T) {
case LOAD_CNT:
- return ST->hasExtendedWaitCounts() ? AMDGPU::getLoadcntBitMask(IV)
- : AMDGPU::getVmcntBitMask(IV);
+ return Limits->LoadcntMax;
case DS_CNT:
- return ST->hasExtendedWaitCounts() ? AMDGPU::getDscntBitMask(IV)
- : AMDGPU::getLgkmcntBitMask(IV);
+ return Limits->DscntMax;
case EXP_CNT:
- return AMDGPU::getExpcntBitMask(IV);
+ return Limits->ExpcntMax;
case STORE_CNT:
- return AMDGPU::getStorecntBitMask(IV);
+ return Limits->StorecntMax;
case SAMPLE_CNT:
- return AMDGPU::getSamplecntBitMask(IV);
+ return Limits->SamplecntMax;
case BVH_CNT:
- return AMDGPU::getBvhcntBitMask(IV);
+ return Limits->BvhcntMax;
case KM_CNT:
- return AMDGPU::getKmcntBitMask(IV);
+ return Limits->KmcntMax;
case X_CNT:
- return AMDGPU::getXcntBitMask(IV);
+ return Limits->XcntMax;
default:
return 0;
}
@@ -381,8 +369,9 @@ class WaitcntGenerator {
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
WaitcntGeneratorPreGFX12() = default;
- WaitcntGeneratorPreGFX12(const MachineFunction &MF)
- : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
+ WaitcntGeneratorPreGFX12(const MachineFunction &MF,
+ const AMDGPU::HardwareLimits *Limits)
+ : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -419,8 +408,9 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
WaitcntGeneratorGFX12Plus() = default;
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
- InstCounterType MaxCounter)
- : WaitcntGenerator(MF, MaxCounter) {}
+ InstCounterType MaxCounter,
+ const AMDGPU::HardwareLimits *Limits)
+ : WaitcntGenerator(MF, MaxCounter, Limits) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -490,7 +480,7 @@ class SIInsertWaitcnts {
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- HardwareLimits Limits;
+ AMDGPU::HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -1620,7 +1610,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
- emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
+ EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
AMDGPU::Waitcnt W;
getCounterRef(W, CT) = Count;
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
@@ -1650,7 +1640,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
ScoreBrackets->getScoreLB(STORE_CNT),
getWaitCountMax(STORE_CNT) - 1);
- emitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
+ EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Count);
@@ -1937,7 +1927,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
- emitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
+ EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(Val);
});
@@ -2891,13 +2881,16 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ // Initialize hardware limits first, as they're needed by the generators.
+ Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts());
+
if (ST->hasExtendedWaitCounts()) {
MaxCounter = NUM_EXTENDED_INST_CNTS;
- WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
+ WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits);
WCG = &WCGGFX12Plus;
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
- WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
+ WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits);
WCG = &WCGPreGFX12;
}
@@ -2908,20 +2901,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
- if (ST->hasExtendedWaitCounts()) {
- Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
- Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
- } else {
- Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
- Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
- }
- Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
- Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
- Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
- Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
- Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
- Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
-
[[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
[[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 90f0b49ab9a78..096c22990c76d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1752,6 +1752,23 @@ unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
+HardwareLimits::HardwareLimits(const IsaVersion &IV,
+ bool HasExtendedWaitCounts) {
+ if (HasExtendedWaitCounts) { // Separate loadcnt/dscnt counters (gfx12+).
+ LoadcntMax = getLoadcntBitMask(IV);
+ DscntMax = getDscntBitMask(IV);
+ } else { // Otherwise the limits come from the vmcnt/lgkmcnt masks.
+ LoadcntMax = getVmcntBitMask(IV);
+ DscntMax = getLgkmcntBitMask(IV);
+ }
+ ExpcntMax = getExpcntBitMask(IV); // The remaining masks depend only on IV.
+ StorecntMax = getStorecntBitMask(IV);
+ SamplecntMax = getSamplecntBitMask(IV);
+ BvhcntMax = getBvhcntBitMask(IV);
+ KmcntMax = getKmcntBitMask(IV);
+ XcntMax = getXcntBitMask(IV);
+}
+
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
getVmcntBitWidthLo(Version.Major));
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3a352006e006c..6c0cb064357ef 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1121,6 +1121,24 @@ struct Waitcnt {
}
};
+/// Holds the maximum value (bit mask) of each hardware wait counter.
+struct HardwareLimits {
+ unsigned LoadcntMax; // Taken from the vmcnt mask prior to gfx12.
+ unsigned ExpcntMax;
+ unsigned DscntMax; // Taken from the lgkmcnt mask prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
+
+ HardwareLimits() = default; // Default-init leaves limits indeterminate.
+
+ /// Sets all limits from the counter bit masks for \p IV; LoadcntMax/DscntMax
+ /// use vmcnt/lgkmcnt unless \p HasExtendedWaitCounts (set for gfx12+).
+ HardwareLimits(const IsaVersion &IV, bool HasExtendedWaitCounts);
+};
+
// The following methods are only meaningful on targets that support
// S_WAITCNT.
>From 7fa0bd2c6344ea9c2b635977f4d3baab762f7d44 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 5 Jan 2026 21:57:31 +0530
Subject: [PATCH 10/10] review: use function attr instead of cl::opt flag
---
clang/include/clang/Basic/CodeGenOptions.def | 4 +++
clang/include/clang/Options/Options.td | 7 ++++
clang/lib/CodeGen/Targets/AMDGPU.cpp | 2 ++
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 ++---
.../AMDGPU/expand-waitcnt-profiling.ll | 32 ++++++++++---------
5 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 76a6463881c6f..ade29f82d4e85 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -459,6 +459,10 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign)
/// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only)
CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign)
+/// Enable expanded waitcnt for profiling (AMDGPU Only)
+/// Expands s_waitcnt instructions to help PC-sampling profilers identify stalls.
+CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign)
+
// Whether to emit Swift Async function extended frame information: auto,
// never, always.
ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index a8fc1c4326cc5..dc2ea02a22595 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5501,6 +5501,13 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee",
"This option changes the ABI. (AMDGPU only)">,
NegFlag<SetFalse, [], [ClangOption, CC1Option]>>;
+defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profiling",
+ CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt instructions to help "
+ "PC-sampling profilers identify memory stalls. Instead of a single waitcnt(target), "
+ "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
+ NegFlag<SetFalse, [], [ClangOption]>>;
+
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index e4ad078dab197..f3f2670627849 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -446,6 +446,8 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
}
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
+ if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
+ F->addFnAttr("amdgpu-expand-waitcnt-profiling");
}
unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 90e262a9b9269..649cea2cb92ed 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,11 +63,6 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
-static cl::opt<bool> ExpandWaitcntProfiling(
- "amdgpu-expand-waitcnt-profiling",
- cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false),
- cl::Hidden);
-
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -278,6 +273,7 @@ class WaitcntGenerator {
AMDGPU::IsaVersion IV;
InstCounterType MaxCounter;
bool OptNone;
+ bool ExpandWaitcntProfiling = false;
const AMDGPU::HardwareLimits *Limits = nullptr;
public:
@@ -288,6 +284,8 @@ class WaitcntGenerator {
IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
OptNone(MF.getFunction().hasOptNone() ||
MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
+ ExpandWaitcntProfiling(
+ MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
Limits(Limits) {}
// Return true if the current function should be compiled with no
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index b15b58f6a58d8..848a9d07084ed 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX9-EXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9-NOEXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX10-EXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10-NOEXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX11-EXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11-NOEXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX12-EXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s
; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding
; operations, instead of emitting a single waitcnt(target), we emit:
@@ -15,7 +15,7 @@
; This allows PC-sampling profilers to identify which specific operation
; is causing a stall by observing where the program counter is stuck.
-define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -145,7 +145,7 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt
ret void
}
-define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_vmcnt_global_loads:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -299,7 +299,7 @@ define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr ad
declare i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
@@ -446,7 +446,7 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
ret void
}
-define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -577,7 +577,7 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_
; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete
; out-of-order relative to each other. When both are in-flight, we should NOT expand
; because the expansion would be misleading.
-define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
@@ -716,7 +716,7 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
ret void
}
-define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) {
+define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 {
; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+)
; GFX9-EXPAND-LABEL: test_vscnt_global_stores:
; GFX9-EXPAND: ; %bb.0: ; %entry
@@ -856,7 +856,7 @@ entry:
ret void
}
-define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) {
+define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 {
; Test export operations (EXP_CNT/expcnt)
; GFX9-EXPAND-LABEL: test_expcnt_exports:
; GFX9-EXPAND: ; %bb.0: ; %entry
@@ -940,3 +940,5 @@ entry:
}
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
+
+attributes #0 = { nounwind ATTRS }
More information about the cfe-commits
mailing list