[clang] [llvm] [AMDGPU] Implement Waitcnt Expansion for Profiling (PR #169345)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 24 09:49:38 PST 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/169345
From beb404722561291859b6bcd7c0615ea7616967d2 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Nov 2025 21:00:58 +0530
Subject: [PATCH 1/2] Implement compiler option
 -mamdgpu-expand-waitcnt-profiling to expand waitcnt instructions
---
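A quick note on usage and the intended effect (the counter values below mirror
the expectations written in the comments of the new codegen test and are
illustrative rather than taken from a particular kernel): the clang flag
-mamdgpu-expand-waitcnt-profiling maps to the +expand-waitcnt-profiling
subtarget feature, so the expansion can also be enabled directly with llc via
-mattr=+expand-waitcnt-profiling. With the feature enabled, a wait such as

    s_waitcnt lgkmcnt(0)   ; single wait with three scalar loads in flight

is expected to be emitted instead as

    s_waitcnt lgkmcnt(2)
    s_waitcnt lgkmcnt(1)
    s_waitcnt lgkmcnt(0)

so that a PC sample landing on an individual wait attributes the stall to a
specific outstanding memory operation rather than to one combined wait.
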
clang/include/clang/Driver/Options.td | 5 +-
clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 +
clang/test/Driver/amdgpu-features.c | 6 +
llvm/lib/Target/AMDGPU/AMDGPU.td | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 +
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 165 ++++++++++++
.../AMDGPU/expand-waitcnt-profiling.ll | 239 ++++++++++++++++++
7 files changed, 427 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 11e81e032d5fc..c0ba716484b6a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5497,7 +5497,10 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
" mode (AMDGPU only)">;
defm amdgpu_precise_memory_op
: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
- " precise memory mode (AMDGPU only)">;
+ " precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>;
+defm amdgpu_expand_waitcnt_profiling
+ : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable",
+ " waitcnt expansion for profiling (AMDGPU only)", m_amdgpu_Features_Group>;
def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">,
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 1a243fef9532d..f4ddb48c9abc6 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -700,6 +700,10 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
options::OPT_mno_amdgpu_precise_memory_op, false))
Features.push_back("+precise-memory");
+ if (Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling,
+ options::OPT_mno_amdgpu_expand_waitcnt_profiling, false))
+ Features.push_back("+expand-waitcnt-profiling");
+
handleTargetFeaturesGroup(D, Triple, Args, Features,
options::OPT_m_amdgpu_Features_Group);
}
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index 864744db203e9..16b3f4121ab7a 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -38,3 +38,9 @@
// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
// NO-PREC-MEM-NOT: {{".*precise-memory"}}
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s
+// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling"
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s
+// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1f8682e..3f9166f48ea22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -223,6 +223,10 @@ def FeaturePreciseMemory
: SubtargetFeature<"precise-memory", "EnablePreciseMemory",
"true", "Enable precise memory mode">;
+def FeatureExpandWaitcntProfiling
+ : SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling",
+ "true", "Expand waitcnt instructions for profiling">;
+
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f377b8aaf1333..f2b885a790f41 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -90,6 +90,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool EnableCuMode = false;
bool TrapHandler = false;
bool EnablePreciseMemory = false;
+ bool EnableExpandWaitcntProfiling = false;
// Used as options.
bool EnableLoadStoreOpt = false;
@@ -674,6 +675,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
+ bool isExpandWaitcntProfilingEnabled() const {
+ return EnableExpandWaitcntProfiling;
+ }
+
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..4a70479358bad 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -494,6 +494,16 @@ class SIInsertWaitcnts {
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
bool run(MachineFunction &MF);
+ // Methods for expanding waitcnt instructions for profiling
+ bool expandWaitcntsForProfiling(MachineFunction &MF);
+ bool expandSingleWaitcnt(MachineInstr &MI, MachineBasicBlock &MBB);
+ bool expandSingleCounterWait(MachineInstr &MI, MachineBasicBlock &MBB,
+ InstCounterType CT);
+ bool expandCounterSequence(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPos,
+ InstCounterType CT, unsigned CountValue,
+ DebugLoc DL);
+
void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
@@ -2725,6 +2735,156 @@ SIInsertWaitcntsPass::run(MachineFunction &MF,
.preserve<AAManager>();
}
+/// Expand waitcnt instructions for profiling by inserting a sequence of
+/// decreasing counter values. This helps identify which specific memory
+/// operation is a bottleneck during PC sampling.
+bool SIInsertWaitcnts::expandWaitcntsForProfiling(MachineFunction &MF) {
+ if (!ST->isExpandWaitcntProfilingEnabled())
+ return false;
+
+ bool Modified = false;
+
+ // Iterate through all basic blocks
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
+ MachineInstr &MI = *I;
+ ++I; // Advance iterator before potential expansion
+
+ if (ST->hasExtendedWaitCounts()) {
+ // GFX12+: Handle separate wait instructions
+ if (auto CT = counterTypeForInstr(MI.getOpcode())) {
+ Modified |= expandSingleCounterWait(MI, MBB, *CT);
+ }
+ } else {
+ // Pre-GFX12: Handle combined S_WAITCNT
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT) {
+ Modified |= expandSingleWaitcnt(MI, MBB);
+ }
+ }
+ }
+ }
+
+ return Modified;
+}
+
+/// Expand a single S_WAITCNT instruction (pre-GFX12)
+bool SIInsertWaitcnts::expandSingleWaitcnt(MachineInstr &MI,
+ MachineBasicBlock &MBB) {
+ assert(MI.getOpcode() == AMDGPU::S_WAITCNT);
+
+ // Decode the waitcnt immediate
+ unsigned Imm = MI.getOperand(0).getImm();
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt(IV, Imm);
+
+ // Insert expanded waitcnts BEFORE the original instruction
+ auto InsertPos = MI.getIterator();
+ DebugLoc DL = MI.getDebugLoc();
+
+ bool Modified = false;
+
+  // Expand each counter independently. Independent counters (Case 2 from the
+  // requirements), such as vmcnt and lgkmcnt, can be waited on with separate
+  // wait instructions.
+ Modified |= expandCounterSequence(MBB, InsertPos, LOAD_CNT, Wait.LoadCnt, DL);
+ Modified |= expandCounterSequence(MBB, InsertPos, DS_CNT, Wait.DsCnt, DL);
+ Modified |= expandCounterSequence(MBB, InsertPos, EXP_CNT, Wait.ExpCnt, DL);
+ Modified |=
+ expandCounterSequence(MBB, InsertPos, STORE_CNT, Wait.StoreCnt, DL);
+
+ // If we expanded anything, remove the original waitcnt
+ if (Modified) {
+ MI.eraseFromParent();
+ }
+
+ return Modified;
+}
+
+/// Expand a single counter wait instruction (GFX12+)
+bool SIInsertWaitcnts::expandSingleCounterWait(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ InstCounterType CT) {
+ // Get the counter value from the instruction
+ unsigned CountValue = MI.getOperand(0).getImm();
+
+ // Insert expanded waitcnts BEFORE the original instruction
+ auto InsertPos = MI.getIterator();
+ DebugLoc DL = MI.getDebugLoc();
+
+ bool Modified = expandCounterSequence(MBB, InsertPos, CT, CountValue, DL);
+
+ // If we expanded, remove the original instruction
+ if (Modified) {
+ MI.eraseFromParent();
+ }
+
+ return Modified;
+}
+
+/// Insert a sequence of wait instructions with decreasing counter values
+bool SIInsertWaitcnts::expandCounterSequence(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos,
+ InstCounterType CT, unsigned CountValue, DebugLoc DL) {
+ // Skip if counter is already at zero, not active, or at max (wait not needed)
+ if (CountValue == 0 || CountValue == ~0u)
+ return false;
+
+ unsigned MaxCount = getWaitCountMax(CT);
+ if (CountValue >= MaxCount)
+ return false;
+
+ bool Modified = false;
+
+  // Generate the decreasing sequence CountValue-1, CountValue-2, ..., 1, 0.
+  // Each inserted wait is at least as strict as the original wait on
+  // CountValue, which the caller erases once the expansion has been emitted.
+ for (int i = CountValue - 1; i >= 0; --i) {
+ if (ST->hasExtendedWaitCounts()) {
+ // GFX12+: Use separate wait instructions
+ unsigned Opcode = instrsForExtendedCounterTypes[CT];
+ BuildMI(MBB, InsertPos, DL, TII->get(Opcode)).addImm(i);
+ } else {
+ // Pre-GFX12: Use combined S_WAITCNT with only this counter set
+ AMDGPU::Waitcnt Wait;
+ switch (CT) {
+ case LOAD_CNT:
+ Wait.LoadCnt = i;
+ break;
+ case DS_CNT:
+ Wait.DsCnt = i;
+ break;
+ case EXP_CNT:
+ Wait.ExpCnt = i;
+ break;
+ case STORE_CNT:
+ Wait.StoreCnt = i;
+ break;
+ case SAMPLE_CNT:
+ Wait.SampleCnt = i;
+ break;
+ case BVH_CNT:
+ Wait.BvhCnt = i;
+ break;
+ case KM_CNT:
+ Wait.KmCnt = i;
+ break;
+ case X_CNT:
+ Wait.XCnt = i;
+ break;
+ default:
+ break;
+ }
+
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(MBB, InsertPos, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ }
+ Modified = true;
+ }
+
+ return Modified;
+}
+
bool SIInsertWaitcnts::run(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
@@ -2963,5 +3123,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
PreheadersToFlush.clear();
SLoadAddresses.clear();
+ // Expand waitcnts for profiling if requested
+ if (ST->isExpandWaitcntProfilingEnabled()) {
+ Modified |= expandWaitcntsForProfiling(MF);
+ }
+
return Modified;
}
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
new file mode 100644
index 0000000000000..cc99c457677ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -0,0 +1,239 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
+
+; NOTE: These simple test cases are optimized so that only waitcnt(0) remains
+; by the time the loaded values are needed. The expansion feature correctly does NOT expand waitcnt(0).
+
+; Pattern: Multiple scalar loads that increment lgkmcnt, followed by use
+; Expected on real kernels with non-zero lgkmcnt:
+; WITHOUT expansion: s_waitcnt lgkmcnt(0)
+; WITH expansion: s_waitcnt lgkmcnt(2)
+; s_waitcnt lgkmcnt(1)
+; s_waitcnt lgkmcnt(0)
+
+define amdgpu_kernel void @case1_single_counter_lgkmcnt(
+; EXPAND-LABEL: case1_single_counter_lgkmcnt:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_add_i32 s0, s0, s1
+; EXPAND-NEXT: s_add_i32 s0, s0, s2
+; EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case1_single_counter_lgkmcnt:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
+; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
+; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_add_i32 s0, s0, s1
+; NOEXPAND-NEXT: s_add_i32 s0, s0, s2
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(4) %ptr_a,
+ ptr addrspace(4) %ptr_b,
+ ptr addrspace(4) %ptr_c,
+ ptr addrspace(1) %out) {
+ ; Three scalar loads - increment lgkmcnt
+ %val_a = load i32, ptr addrspace(4) %ptr_a, align 4
+ %val_b = load i32, ptr addrspace(4) %ptr_b, align 4
+ %val_c = load i32, ptr addrspace(4) %ptr_c, align 4
+
+ ; Use all three values
+ %sum1 = add i32 %val_a, %val_b
+ %sum2 = add i32 %sum1, %val_c
+
+ store i32 %sum2, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Pattern: Global load (vmcnt) and scalar load (lgkmcnt) can be separated
+; Expected on real kernels with non-zero counters:
+; WITHOUT expansion: s_waitcnt vmcnt(0) lgkmcnt(0)
+; WITH expansion: s_waitcnt vmcnt(0)
+; s_waitcnt lgkmcnt(0)
+
+define amdgpu_kernel void @case2_independent_counters(
+; EXPAND-LABEL: case2_independent_counters:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: s_add_i32 s0, s4, s5
+; EXPAND-NEXT: v_mov_b32_e32 v1, s0
+; EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case2_independent_counters:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
+; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: s_add_i32 s0, s4, s5
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(1) %global_ptr,
+ ptr addrspace(4) %scalar_ptr,
+ ptr addrspace(1) %out) {
+ ; Global memory load - increments vmcnt
+ %global_val = load i32, ptr addrspace(1) %global_ptr, align 4
+
+ ; Scalar memory load - increments lgkmcnt
+ %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4
+
+ ; Use both values - compiler must wait for both counters
+ %result = add i32 %global_val, %scalar_val
+
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; Pattern: Multiple buffer stores followed by a load (all affect vmcnt)
+; Expected on real kernels with many stores (e.g., 12 stores):
+; WITHOUT expansion: s_waitcnt vmcnt(0)
+; WITH expansion: s_waitcnt vmcnt(11)
+; s_waitcnt vmcnt(10)
+; ...
+; s_waitcnt vmcnt(1)
+; s_waitcnt vmcnt(0)
+
+define amdgpu_kernel void @case3_overlapping_counters(
+; EXPAND-LABEL: case3_overlapping_counters:
+; EXPAND: ; %bb.0:
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT: v_mov_b32_e32 v0, 0
+; EXPAND-NEXT: v_mov_b32_e32 v1, 1
+; EXPAND-NEXT: v_mov_b32_e32 v2, 2
+; EXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: s_add_u32 s2, s2, s6
+; EXPAND-NEXT: s_addc_u32 s3, s3, s7
+; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
+; EXPAND-NEXT: s_waitcnt vmcnt(0)
+; EXPAND-NEXT: s_endpgm
+;
+; NOEXPAND-LABEL: case3_overlapping_counters:
+; NOEXPAND: ; %bb.0:
+; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
+; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
+; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: s_add_u32 s2, s2, s6
+; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7
+; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48
+; NOEXPAND-NEXT: s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT: s_endpgm
+ ptr addrspace(1) %buf,
+ ptr addrspace(1) %data,
+ i64 %offset) {
+ ; Issue 12 stores to buffer - each increments vmcnt
+ %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0
+ store volatile i32 1, ptr addrspace(1) %ptr0, align 4
+ %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1
+ store volatile i32 2, ptr addrspace(1) %ptr1, align 4
+ %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2
+ store volatile i32 1, ptr addrspace(1) %ptr2, align 4
+ %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3
+ store volatile i32 2, ptr addrspace(1) %ptr3, align 4
+ %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4
+ store volatile i32 1, ptr addrspace(1) %ptr4, align 4
+ %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5
+ store volatile i32 2, ptr addrspace(1) %ptr5, align 4
+ %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6
+ store volatile i32 1, ptr addrspace(1) %ptr6, align 4
+ %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7
+ store volatile i32 2, ptr addrspace(1) %ptr7, align 4
+ %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8
+ store volatile i32 1, ptr addrspace(1) %ptr8, align 4
+ %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9
+ store volatile i32 2, ptr addrspace(1) %ptr9, align 4
+ %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10
+ store volatile i32 1, ptr addrspace(1) %ptr10, align 4
+ %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11
+ store volatile i32 2, ptr addrspace(1) %ptr11, align 4
+
+ ; Load from potentially aliasing address - also increments vmcnt
+ %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset
+ %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4
+
+ ; Store the loaded value
+ %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12
+ store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4
+
+ ret void
+}
From 532c866dcc8079489eb60fff37d2c72cc6310687 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Nov 2025 23:19:22 +0530
Subject: [PATCH 2/2] address review: remove subtarget integration
---
clang/include/clang/Driver/Options.td | 5 +----
clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 ----
clang/test/Driver/amdgpu-features.c | 6 ------
llvm/lib/Target/AMDGPU/AMDGPU.td | 4 ----
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 -----
5 files changed, 1 insertion(+), 23 deletions(-)
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c0ba716484b6a..11e81e032d5fc 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5497,10 +5497,7 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
" mode (AMDGPU only)">;
defm amdgpu_precise_memory_op
: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
- " precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>;
-defm amdgpu_expand_waitcnt_profiling
- : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable",
- " waitcnt expansion for profiling (AMDGPU only)", m_amdgpu_Features_Group>;
+ " precise memory mode (AMDGPU only)">;
def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">,
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index f4ddb48c9abc6..1a243fef9532d 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -700,10 +700,6 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
options::OPT_mno_amdgpu_precise_memory_op, false))
Features.push_back("+precise-memory");
- if (Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling,
- options::OPT_mno_amdgpu_expand_waitcnt_profiling, false))
- Features.push_back("+expand-waitcnt-profiling");
-
handleTargetFeaturesGroup(D, Triple, Args, Features,
options::OPT_m_amdgpu_Features_Group);
}
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index 16b3f4121ab7a..864744db203e9 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -38,9 +38,3 @@
// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
// NO-PREC-MEM-NOT: {{".*precise-memory"}}
-
-// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s
-// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling"
-
-// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s
-// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 3f9166f48ea22..54d94b1f8682e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -223,10 +223,6 @@ def FeaturePreciseMemory
: SubtargetFeature<"precise-memory", "EnablePreciseMemory",
"true", "Enable precise memory mode">;
-def FeatureExpandWaitcntProfiling
- : SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling",
- "true", "Expand waitcnt instructions for profiling">;
-
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f2b885a790f41..f377b8aaf1333 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -90,7 +90,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool EnableCuMode = false;
bool TrapHandler = false;
bool EnablePreciseMemory = false;
- bool EnableExpandWaitcntProfiling = false;
// Used as options.
bool EnableLoadStoreOpt = false;
@@ -675,10 +674,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
- bool isExpandWaitcntProfilingEnabled() const {
- return EnableExpandWaitcntProfiling;
- }
-
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}