[llvm] [AMDGPU] Implement Waitcnt Expansion for Profiling (PR #169345)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 25 09:02:36 PST 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/169345
From 0745347cec016868cad1f1e5eaab10489918bc98 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Nov 2025 22:23:25 +0530
Subject: [PATCH] [AMDGPU] Add -amdgpu-expand-waitcnt-profiling option for
PC-sampling profiling
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 221 ++++++++++++++---
.../AMDGPU/expand-waitcnt-profiling.ll | 230 ++++++++++++++++++
2 files changed, 422 insertions(+), 29 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
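The core of the change: instead of emitting one s_waitcnt that blocks until the target count is reached, the pass can emit a descending ladder of waits, one per outstanding event, so a PC sample taken during the stall lands on the step for the event actually being waited on. A minimal standalone sketch of that ladder, assuming the simplest possible interface (the helper name below is illustrative, not part of the patch):

    #include <vector>

    // Given the number of outstanding events and the target count an
    // instruction must wait for, return the descending immediates to emit:
    // Outstanding-1, Outstanding-2, ..., Target, or just Target when nothing
    // beyond the target is in flight.  With Outstanding=3 and Target=0 this
    // yields {2, 1, 0}, i.e. the lgkmcnt(2)/(1)/(0) sequence in the test.
    std::vector<unsigned> profilingWaitLadder(unsigned Outstanding,
                                              unsigned Target) {
      std::vector<unsigned> Counts;
      if (Outstanding <= Target) {
        Counts.push_back(Target);
        return Counts;
      }
      for (unsigned I = Outstanding - 1; ; --I) {
        Counts.push_back(I);
        if (I == Target)
          break;
      }
      return Counts;
    }

The real implementation below works per counter type directly on MachineInstr builders, but each expansion branch has the same shape as this loop.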
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..05baf59bd43fa 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,6 +63,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> ExpandWaitcntProfiling(
+ "amdgpu-expand-waitcnt-profiling",
+ cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false),
+ cl::Hidden);
+
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -297,6 +302,30 @@ class WaitcntGenerator {
// optimization.
bool isOptNone() const { return OptNone; }
+ // Get the maximum wait count value for a given counter type
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return AMDGPU::getLoadcntBitMask(IV);
+ case DS_CNT:
+ return AMDGPU::getDscntBitMask(IV);
+ case EXP_CNT:
+ return AMDGPU::getExpcntBitMask(IV);
+ case STORE_CNT:
+ return AMDGPU::getStorecntBitMask(IV);
+ case SAMPLE_CNT:
+ return AMDGPU::getSamplecntBitMask(IV);
+ case BVH_CNT:
+ return AMDGPU::getBvhcntBitMask(IV);
+ case KM_CNT:
+ return AMDGPU::getKmcntBitMask(IV);
+ case X_CNT:
+ return 0; // No hardware limit for XCNT
+ default:
+ return 0;
+ }
+ }
+
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
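getWaitCountMax above returns the per-counter bit mask from the AMDGPU::get*BitMask helpers, i.e. the largest value the corresponding wait-count field can encode. The expansion code further down clamps the number of outstanding events against it before walking the ladder; a hedged restatement of that clamp as it appears in createNewWaitcnt, with Brackets standing in for the pass's WaitcntBrackets object (std::min as in the pass):

    // UB - LB is how many events of this counter type the score brackets
    // still consider in flight; clamp it so every immediate produced by the
    // descending loop fits in the wait-count field for this counter.
    unsigned Outstanding =
        std::min(Brackets->getScoreUB(LOAD_CNT) - Brackets->getScoreLB(LOAD_CNT),
                 getWaitCountMax(LOAD_CNT) - 1);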
@@ -318,9 +347,11 @@ class WaitcntGenerator {
// Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
+ // If ScoreBrackets is provided, it can be used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) = 0;
+ AMDGPU::Waitcnt Wait,
+ WaitcntBrackets *ScoreBrackets = nullptr) = 0;
// Returns an array of bit masks which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
@@ -356,7 +387,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
+ AMDGPU::Waitcnt Wait,
+ WaitcntBrackets *ScoreBrackets = nullptr) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -393,7 +425,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) override;
+ AMDGPU::Waitcnt Wait,
+ WaitcntBrackets *ScoreBrackets = nullptr) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -1523,7 +1556,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
+ AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(isNormalMode(MaxCounter));
@@ -1533,28 +1566,125 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
+ // If profiling expansion is enabled and we have score brackets,
+ // emit an expanded sequence
+ if (ExpandWaitcntProfiling && ScoreBrackets) {
+ // Emit expansion for each active counter
+ if (Wait.LoadCnt != ~0u) {
+ unsigned UB = ScoreBrackets->getScoreUB(LOAD_CNT);
+ unsigned LB = ScoreBrackets->getScoreLB(LOAD_CNT);
+ unsigned Outstanding = std::min(UB - LB, getWaitCountMax(LOAD_CNT) - 1);
+ // Start at Outstanding - 1 since waitcnt(Outstanding) is a no-op
+ if (Outstanding > Wait.LoadCnt) {
+ for (unsigned i = Outstanding - 1; i >= Wait.LoadCnt && i != ~0u;
+ --i) {
+ AMDGPU::Waitcnt ExpandWait;
+ ExpandWait.LoadCnt = i;
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ }
+ } else {
+ // Nothing to expand, just emit the target waitcnt
+ AMDGPU::Waitcnt ExpandWait;
+ ExpandWait.LoadCnt = Wait.LoadCnt;
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ }
+ }
+ if (Wait.DsCnt != ~0u) {
+ unsigned UB = ScoreBrackets->getScoreUB(DS_CNT);
+ unsigned LB = ScoreBrackets->getScoreLB(DS_CNT);
+ unsigned Outstanding = std::min(UB - LB, getWaitCountMax(DS_CNT) - 1);
+ // Start at Outstanding - 1 since waitcnt(Outstanding) is a no-op
+ if (Outstanding > Wait.DsCnt) {
+ for (unsigned i = Outstanding - 1; i >= Wait.DsCnt && i != ~0u; --i) {
+ AMDGPU::Waitcnt ExpandWait;
+ ExpandWait.DsCnt = i;
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ }
+ } else {
+ // Nothing to expand, just emit the target waitcnt
+ AMDGPU::Waitcnt ExpandWait;
+ ExpandWait.DsCnt = Wait.DsCnt;
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ }
+ }
+ if (Wait.ExpCnt != ~0u) {
+ unsigned UB = ScoreBrackets->getScoreUB(EXP_CNT);
+ unsigned LB = ScoreBrackets->getScoreLB(EXP_CNT);
+ unsigned Outstanding = std::min(UB - LB, getWaitCountMax(EXP_CNT) - 1);
+ // Start at Outstanding - 1 since waitcnt(Outstanding) is a no-op
+ if (Outstanding > Wait.ExpCnt) {
+ for (unsigned i = Outstanding - 1; i >= Wait.ExpCnt && i != ~0u;
+ --i) {
+ AMDGPU::Waitcnt ExpandWait;
+ ExpandWait.ExpCnt = i;
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ }
+ } else {
+ // Nothing to expand, just emit the target waitcnt
+ AMDGPU::Waitcnt ExpandWait;
+ ExpandWait.ExpCnt = Wait.ExpCnt;
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+ }
+ }
+ } else {
+ // Normal behavior: emit single combined waitcnt
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
}
if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());
- [[maybe_unused]] auto SWaitInst =
+ if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) {
+ unsigned UB = ScoreBrackets->getScoreUB(STORE_CNT);
+ unsigned LB = ScoreBrackets->getScoreLB(STORE_CNT);
+ unsigned Outstanding = std::min(UB - LB, getWaitCountMax(STORE_CNT) - 1);
+ // Start at Outstanding - 1 since waitcnt(Outstanding) is a no-op
+ if (Outstanding > Wait.StoreCnt) {
+ for (unsigned i = Outstanding - 1; i >= Wait.StoreCnt && i != ~0u;
+ --i) {
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(i);
+ Modified = true;
+ }
+ } else {
+ // Nothing to expand, just emit the target waitcnt
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.StoreCnt);
- Modified = true;
+ Modified = true;
+ }
+ } else {
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.StoreCnt);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
}
return Modified;
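One property worth noting in the pre-GFX12 ladders above: each expanded S_WAITCNT is built from a fresh AMDGPU::Waitcnt with only one field set, and the remaining fields keep their default "no wait" values, so a ladder for one counter does not tighten the others. A minimal sketch of a single lgkmcnt ladder step under that reading, mirroring the DsCnt branch (I is an illustrative loop variable; DsCnt is the field that corresponds to lgkmcnt before gfx12):

    // One step of the lgkmcnt ladder: only DsCnt is constrained; the other
    // counters in ExpandWait stay at their "no wait" defaults, so the encoded
    // immediate does not force vmcnt or expcnt to drain.
    AMDGPU::Waitcnt ExpandWait;
    ExpandWait.DsCnt = I;
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
    BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);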
@@ -1777,13 +1907,44 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait) {
+ AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(!isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
+ // For GFX12+, we use separate wait instructions, which makes expansion
+ // simpler
+ if (ExpandWaitcntProfiling && ScoreBrackets) {
+ // Emit expanded sequence for each active counter
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ unsigned Count = getWait(Wait, CT);
+ if (Count == ~0u)
+ continue;
+
+ unsigned UB = ScoreBrackets->getScoreUB(CT);
+ unsigned LB = ScoreBrackets->getScoreLB(CT);
+ unsigned Outstanding = std::min(UB - LB, getWaitCountMax(CT) - 1);
+
+ // Start at Outstanding - 1 since waitcnt(Outstanding) is a no-op
+ if (Outstanding > Count) {
+ for (unsigned i = Outstanding - 1; i >= Count && i != ~0u; --i) {
+ BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(i);
+ Modified = true;
+ }
+ } else {
+ // Nothing to expand, just emit the target waitcnt
+ BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Count);
+ Modified = true;
+ }
+ }
+ return Modified;
+ }
+
+ // Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
@@ -2141,9 +2302,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
- // Any counts that could have been applied to any existing waitcnt
- // instructions will have been done so, now deal with any remaining.
- ScoreBrackets.applyWaitcnt(Wait);
+ AMDGPU::Waitcnt WaitForScore = Wait;
// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -2161,23 +2320,27 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
// XCnt may be already consumed by a load wait.
+ // When we wait for KmCnt==0, all SMEM operations (including address
+ // translations) are complete, so the XCnt wait is redundant. When we wait
+ // for LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address
+ // translations are complete (since XCnt follows LoadCnt for loads). When the
+ // current instruction is a VMEM access, translations occur in order.
if (Wait.XCnt != ~0u) {
- if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+ if (Wait.KmCnt == 0)
Wait.XCnt = ~0u;
-
- if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+ else if (Wait.LoadCnt == 0 && Wait.XCnt == 0)
Wait.XCnt = ~0u;
-
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (isVmemAccess(*It))
+ else if (isVmemAccess(*It))
Wait.XCnt = ~0u;
}
- if (WCG->createNewWaitcnt(Block, It, Wait))
+ if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
Modified = true;
+ // Any counts that could have been applied to any existing waitcnt
+ // instructions will have been done so, now deal with any remaining.
+ ScoreBrackets.applyWaitcnt(WaitForScore);
+
return Modified;
}
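The reordering in generateWaitcnt is deliberate: ScoreBrackets.applyWaitcnt used to run before the new waits were created, but the expansion reads UB - LB from the brackets, so the update is deferred until after createNewWaitcnt so the brackets still describe the pre-wait state, while the WaitForScore snapshot (taken before the XCnt pruning) keeps the score update itself unchanged. A condensed view of the new ordering, using the same names as the pass:

    AMDGPU::Waitcnt WaitForScore = Wait;        // snapshot before XCnt pruning
    // ... Wait.XCnt may be cleared above ...
    if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
      Modified = true;                          // brackets still hold pre-wait scores
    ScoreBrackets.applyWaitcnt(WaitForScore);   // retire the counts only afterwards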
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
new file mode 100644
index 0000000000000..3daf3142f2a96
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
+
+; This test demonstrates the waitcnt expansion feature for PC-sampling profiling.
+; The expansion transforms a single waitcnt instruction into a sequence of waitcnts
+; with decreasing counter values to help identify which specific memory operation
+; is causing a bottleneck.
+;
+; The kernels below keep multiple memory operations in flight before each waitcnt
+; so that ScoreBrackets tracks a non-zero number of outstanding events. When
+; -amdgpu-expand-waitcnt-profiling is enabled, each combined wait is expanded
+; into a descending sequence (e.g. outstanding=3 emits lgkmcnt(2), (1), (0)),
+; which lets PC sampling attribute long-latency stalls to the specific operation.
+
+define amdgpu_kernel void @case1_single_counter_lgkmcnt(
+; EXPAND-LABEL: case1_single_counter_lgkmcnt:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(2)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_add_i32 s0, s0, s1
+; EXPAND-NEXT: s_add_i32 s0, s0, s2
+; EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case1_single_counter_lgkmcnt:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; NOEXPAND-NEXT: s_add_i32 s0, s0, s2
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(4) %ptr_a,
+ ptr addrspace(4) %ptr_b,
+ ptr addrspace(4) %ptr_c,
+ ptr addrspace(1) %out) {
+ ; Three scalar loads - increment lgkmcnt
+ %val_a = load i32, ptr addrspace(4) %ptr_a, align 4
+ %val_b = load i32, ptr addrspace(4) %ptr_b, align 4
+ %val_c = load i32, ptr addrspace(4) %ptr_c, align 4
+
+ ; Use all three values
+ %sum1 = add i32 %val_a, %val_b
+ %sum2 = add i32 %sum1, %val_c
+
+ store i32 %sum2, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @case2_independent_counters(
+; EXPAND-LABEL: case2_independent_counters:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_add_i32 s0, s4, s5
+; EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case2_independent_counters:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_add_i32 s0, s4, s5
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(1) %global_ptr,
+ ptr addrspace(4) %scalar_ptr,
+ ptr addrspace(1) %out) {
+ ; Global memory load - increments vmcnt
+ %global_val = load i32, ptr addrspace(1) %global_ptr, align 4
+
+ ; Scalar memory load - increments lgkmcnt
+ %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4
+
+ ; Use both values - compiler must wait for both counters
+ %result = add i32 %global_val, %scalar_val
+
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @case3_overlapping_counters(
+; EXPAND-LABEL: case3_overlapping_counters:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: v_mov_b32_e32 v1, 1
+; EXPAND-NEXT: v_mov_b32_e32 v2, 2
+; EXPAND-NEXT: s_waitcnt lgkmcnt(1)
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: s_add_u32 s2, s2, s6
+; EXPAND-NEXT: s_addc_u32 s3, s3, s7
+; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case3_overlapping_counters:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
+; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: s_add_u32 s2, s2, s6
+; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7
+; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(1) %buf,
+ ptr addrspace(1) %data,
+ i64 %offset) {
+ ; Issue 12 stores to buffer - each increments vmcnt
+ %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0
+ store volatile i32 1, ptr addrspace(1) %ptr0, align 4
+ %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1
+ store volatile i32 2, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2
+ store volatile i32 1, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3
+ store volatile i32 2, ptr addrspace(1) %ptr3, align 4
+ %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4
+ store volatile i32 1, ptr addrspace(1) %ptr4, align 4
+ %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5
+ store volatile i32 2, ptr addrspace(1) %ptr5, align 4
+ %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6
+ store volatile i32 1, ptr addrspace(1) %ptr6, align 4
+ %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7
+ store volatile i32 2, ptr addrspace(1) %ptr7, align 4
+ %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8
+ store volatile i32 1, ptr addrspace(1) %ptr8, align 4
+ %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9
+ store volatile i32 2, ptr addrspace(1) %ptr9, align 4
+ %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10
+ store volatile i32 1, ptr addrspace(1) %ptr10, align 4
+ %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11
+ store volatile i32 2, ptr addrspace(1) %ptr11, align 4
+
+ ; Load from potentially aliasing address - also increments vmcnt
+ %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset
+ %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4
+
+ ; Store the loaded value
+ %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12
+ store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4
+
+ ret void
+}
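The RUN lines in the test show how the feature is exercised directly through llc. For reference (the input file name is just a placeholder, and the second line is an assumption about the usual -mllvm passthrough rather than something this patch adds):

    llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling kernel.ll
    clang ... -mllvm -amdgpu-expand-waitcnt-profiling ...

Since the flag is cl::Hidden and defaults to false, existing compilations are unaffected unless the expansion is requested explicitly.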