[clang] [llvm] [AMDGPU] Implement Waitcnt Expansion for Profiling (PR #169345)

Mon Nov 24 10:37:40 PST 2025

https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/169345

>From beb404722561291859b6bcd7c0615ea7616967d2 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Nov 2025 21:00:58 +0530
Subject: [PATCH 1/3] Implement compiler option
 -mamdgpu-expand-waitcnt-profiling to expand waitcnt instruction

---
 clang/include/clang/Driver/Options.td         |   5 +-
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |   4 +
 clang/test/Driver/amdgpu-features.c           |   6 +
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   4 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   5 +
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 165 ++++++++++++
 .../AMDGPU/expand-waitcnt-profiling.ll        | 239 ++++++++++++++++++
 7 files changed, 427 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 11e81e032d5fc..c0ba716484b6a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5497,7 +5497,10 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
   " mode (AMDGPU only)">;
 defm amdgpu_precise_memory_op
     : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
-                  " precise memory mode (AMDGPU only)">;
+                  " precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>;
+defm amdgpu_expand_waitcnt_profiling
+    : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable",
+                  " waitcnt expansion for profiling (AMDGPU only)", m_amdgpu_Features_Group>;
 
 def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 1a243fef9532d..f4ddb48c9abc6 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -700,6 +700,10 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
                    options::OPT_mno_amdgpu_precise_memory_op, false))
     Features.push_back("+precise-memory");
 
+  if (Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling,
+                   options::OPT_mno_amdgpu_expand_waitcnt_profiling, false))
+    Features.push_back("+expand-waitcnt-profiling");
+
   handleTargetFeaturesGroup(D, Triple, Args, Features,
                             options::OPT_m_amdgpu_Features_Group);
 }
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index 864744db203e9..16b3f4121ab7a 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -38,3 +38,9 @@
 
 // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
 // NO-PREC-MEM-NOT: {{".*precise-memory"}}
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s
+// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling"
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s
+// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1f8682e..3f9166f48ea22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -223,6 +223,10 @@ def FeaturePreciseMemory
     : SubtargetFeature<"precise-memory", "EnablePreciseMemory",
                        "true", "Enable precise memory mode">;
 
+def FeatureExpandWaitcntProfiling
+    : SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling",
+                       "true", "Expand waitcnt instructions for profiling">;
+
 def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
   "SGPRInitBug",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f377b8aaf1333..f2b885a790f41 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -90,6 +90,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool EnableCuMode = false;
   bool TrapHandler = false;
   bool EnablePreciseMemory = false;
+  bool EnableExpandWaitcntProfiling = false;
 
   // Used as options.
   bool EnableLoadStoreOpt = false;
@@ -674,6 +675,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
 
+  bool isExpandWaitcntProfilingEnabled() const {
+    return EnableExpandWaitcntProfiling;
+  }
+
   bool hasFlatAddressSpace() const {
     return FlatAddressSpace;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..4a70479358bad 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -494,6 +494,16 @@ class SIInsertWaitcnts {
   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
   bool run(MachineFunction &MF);
 
+  // Methods for expanding waitcnt instructions for profiling
+  bool expandWaitcntsForProfiling(MachineFunction &MF);
+  bool expandSingleWaitcnt(MachineInstr &MI, MachineBasicBlock &MBB);
+  bool expandSingleCounterWait(MachineInstr &MI, MachineBasicBlock &MBB,
+                               InstCounterType CT);
+  bool expandCounterSequence(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator InsertPos,
+                             InstCounterType CT, unsigned CountValue,
+                             DebugLoc DL);
+
   void setForceEmitWaitcnt() {
 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
 // For debug builds, get the debug counter info and adjust if need be
@@ -2725,6 +2735,156 @@ SIInsertWaitcntsPass::run(MachineFunction &MF,
       .preserve<AAManager>();
 }
 
+/// Expand waitcnt instructions for profiling by inserting a sequence of
+/// decreasing counter values. This helps identify which specific memory
+/// operation is a bottleneck during PC sampling.
+bool SIInsertWaitcnts::expandWaitcntsForProfiling(MachineFunction &MF) {
+  if (!ST->isExpandWaitcntProfilingEnabled())
+    return false;
+
+  bool Modified = false;
+
+  // Iterate through all basic blocks
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
+      MachineInstr &MI = *I;
+      ++I; // Advance iterator before potential expansion
+
+      if (ST->hasExtendedWaitCounts()) {
+        // GFX12+: Handle separate wait instructions
+        if (auto CT = counterTypeForInstr(MI.getOpcode())) {
+          Modified |= expandSingleCounterWait(MI, MBB, *CT);
+        }
+      } else {
+        // Pre-GFX12: Handle combined S_WAITCNT
+        if (MI.getOpcode() == AMDGPU::S_WAITCNT) {
+          Modified |= expandSingleWaitcnt(MI, MBB);
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
+
+/// Expand a single S_WAITCNT instruction (pre-GFX12)
+bool SIInsertWaitcnts::expandSingleWaitcnt(MachineInstr &MI,
+                                           MachineBasicBlock &MBB) {
+  assert(MI.getOpcode() == AMDGPU::S_WAITCNT);
+
+  // Decode the waitcnt immediate
+  unsigned Imm = MI.getOperand(0).getImm();
+  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+  AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt(IV, Imm);
+
+  // Insert expanded waitcnts BEFORE the original instruction
+  auto InsertPos = MI.getIterator();
+  DebugLoc DL = MI.getDebugLoc();
+
+  bool Modified = false;
+
+  // Expand each counter independently
+  // For independent counters (Case 2 from requirements):
+  // vmcnt and lgkmcnt can be separated
+  Modified |= expandCounterSequence(MBB, InsertPos, LOAD_CNT, Wait.LoadCnt, DL);
+  Modified |= expandCounterSequence(MBB, InsertPos, DS_CNT, Wait.DsCnt, DL);
+  Modified |= expandCounterSequence(MBB, InsertPos, EXP_CNT, Wait.ExpCnt, DL);
+  Modified |=
+      expandCounterSequence(MBB, InsertPos, STORE_CNT, Wait.StoreCnt, DL);
+
+  // If we expanded anything, remove the original waitcnt
+  if (Modified) {
+    MI.eraseFromParent();
+  }
+
+  return Modified;
+}
+
+/// Expand a single counter wait instruction (GFX12+)
+bool SIInsertWaitcnts::expandSingleCounterWait(MachineInstr &MI,
+                                               MachineBasicBlock &MBB,
+                                               InstCounterType CT) {
+  // Get the counter value from the instruction
+  unsigned CountValue = MI.getOperand(0).getImm();
+
+  // Insert expanded waitcnts BEFORE the original instruction
+  auto InsertPos = MI.getIterator();
+  DebugLoc DL = MI.getDebugLoc();
+
+  bool Modified = expandCounterSequence(MBB, InsertPos, CT, CountValue, DL);
+
+  // If we expanded, remove the original instruction
+  if (Modified) {
+    MI.eraseFromParent();
+  }
+
+  return Modified;
+}
+
+/// Insert a sequence of wait instructions with decreasing counter values
+bool SIInsertWaitcnts::expandCounterSequence(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos,
+    InstCounterType CT, unsigned CountValue, DebugLoc DL) {
+  // Skip if counter is already at zero, not active, or at max (wait not needed)
+  if (CountValue == 0 || CountValue == ~0u)
+    return false;
+
+  unsigned MaxCount = getWaitCountMax(CT);
+  if (CountValue >= MaxCount)
+    return false;
+
+  bool Modified = false;
+
+  // Generate decreasing sequence: CountValue-1, CountValue-2, ..., 1, 0
+  // We start from CountValue-1 because the original waitcnt already handles
+  // CountValue
+  for (int i = CountValue - 1; i >= 0; --i) {
+    if (ST->hasExtendedWaitCounts()) {
+      // GFX12+: Use separate wait instructions
+      unsigned Opcode = instrsForExtendedCounterTypes[CT];
+      BuildMI(MBB, InsertPos, DL, TII->get(Opcode)).addImm(i);
+    } else {
+      // Pre-GFX12: Use combined S_WAITCNT with only this counter set
+      AMDGPU::Waitcnt Wait;
+      switch (CT) {
+      case LOAD_CNT:
+        Wait.LoadCnt = i;
+        break;
+      case DS_CNT:
+        Wait.DsCnt = i;
+        break;
+      case EXP_CNT:
+        Wait.ExpCnt = i;
+        break;
+      case STORE_CNT:
+        Wait.StoreCnt = i;
+        break;
+      case SAMPLE_CNT:
+        Wait.SampleCnt = i;
+        break;
+      case BVH_CNT:
+        Wait.BvhCnt = i;
+        break;
+      case KM_CNT:
+        Wait.KmCnt = i;
+        break;
+      case X_CNT:
+        Wait.XCnt = i;
+        break;
+      default:
+        break;
+      }
+
+      AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+      BuildMI(MBB, InsertPos, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+    }
+    Modified = true;
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::run(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -2963,5 +3123,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   PreheadersToFlush.clear();
   SLoadAddresses.clear();
 
+  // Expand waitcnts for profiling if requested
+  if (ST->isExpandWaitcntProfilingEnabled()) {
+    Modified |= expandWaitcntsForProfiling(MF);
+  }
+
   return Modified;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
new file mode 100644
index 0000000000000..cc99c457677ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -0,0 +1,239 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
+
+; NOTE: These simple test cases are optimized to generate waitcnt(0) by the
+; time values are needed. The expansion feature correctly does NOT expand waitcnt(0).
+
+; Pattern: Multiple scalar loads that increment lgkmcnt, followed by use
+; Expected on real kernels with non-zero lgkmcnt:
+;   WITHOUT expansion: s_waitcnt lgkmcnt(0)
+;   WITH expansion:    s_waitcnt lgkmcnt(2)
+;                      s_waitcnt lgkmcnt(1)
+;                      s_waitcnt lgkmcnt(0)
+
+define amdgpu_kernel void @case1_single_counter_lgkmcnt(
+; EXPAND-LABEL: case1_single_counter_lgkmcnt:
+; EXPAND:       ; %bb.0:
+; EXPAND-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT:    s_load_dword s0, s[8:9], 0x0
+; EXPAND-NEXT:    s_load_dword s1, s[10:11], 0x0
+; EXPAND-NEXT:    s_load_dword s2, s[12:13], 0x0
+; EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT:    s_add_i32 s0, s0, s1
+; EXPAND-NEXT:    s_add_i32 s0, s0, s2
+; EXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; EXPAND-NEXT:    global_store_dword v0, v1, s[14:15]
+; EXPAND-NEXT:    s_endpgm
+;
+; NOEXPAND-LABEL: case1_single_counter_lgkmcnt:
+; NOEXPAND:       ; %bb.0:
+; NOEXPAND-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT:    s_load_dword s0, s[8:9], 0x0
+; NOEXPAND-NEXT:    s_load_dword s1, s[10:11], 0x0
+; NOEXPAND-NEXT:    s_load_dword s2, s[12:13], 0x0
+; NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT:    s_add_i32 s0, s0, s1
+; NOEXPAND-NEXT:    s_add_i32 s0, s0, s2
+; NOEXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[14:15]
+; NOEXPAND-NEXT:    s_endpgm
+    ptr addrspace(4) %ptr_a,
+    ptr addrspace(4) %ptr_b,
+    ptr addrspace(4) %ptr_c,
+    ptr addrspace(1) %out) {
+  ; Three scalar loads - increment lgkmcnt
+  %val_a = load i32, ptr addrspace(4) %ptr_a, align 4
+  %val_b = load i32, ptr addrspace(4) %ptr_b, align 4
+  %val_c = load i32, ptr addrspace(4) %ptr_c, align 4
+
+  ; Use all three values
+  %sum1 = add i32 %val_a, %val_b
+  %sum2 = add i32 %sum1, %val_c
+
+  store i32 %sum2, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Pattern: Global load (vmcnt) and scalar load (lgkmcnt) can be separated
+; Expected on real kernels with non-zero counters:
+;   WITHOUT expansion: s_waitcnt vmcnt(0) lgkmcnt(0)
+;   WITH expansion:    s_waitcnt vmcnt(0)
+;                      s_waitcnt lgkmcnt(0)
+
+define amdgpu_kernel void @case2_independent_counters(
+; EXPAND-LABEL: case2_independent_counters:
+; EXPAND:       ; %bb.0:
+; EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT:    s_load_dword s4, s[0:1], 0x0
+; EXPAND-NEXT:    s_load_dword s5, s[2:3], 0x0
+; EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT:    s_add_i32 s0, s4, s5
+; EXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; EXPAND-NEXT:    global_store_dword v0, v1, s[6:7]
+; EXPAND-NEXT:    s_endpgm
+;
+; NOEXPAND-LABEL: case2_independent_counters:
+; NOEXPAND:       ; %bb.0:
+; NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT:    s_load_dword s4, s[0:1], 0x0
+; NOEXPAND-NEXT:    s_load_dword s5, s[2:3], 0x0
+; NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT:    s_add_i32 s0, s4, s5
+; NOEXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[6:7]
+; NOEXPAND-NEXT:    s_endpgm
+    ptr addrspace(1) %global_ptr,
+    ptr addrspace(4) %scalar_ptr,
+    ptr addrspace(1) %out) {
+  ; Global memory load - increments vmcnt
+  %global_val = load i32, ptr addrspace(1) %global_ptr, align 4
+
+  ; Scalar memory load - increments lgkmcnt
+  %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4
+
+  ; Use both values - compiler must wait for both counters
+  %result = add i32 %global_val, %scalar_val
+
+  store i32 %result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Pattern: Multiple buffer stores followed by a load (all affect vmcnt)
+; Expected on real kernels with many stores (e.g., 12 stores):
+;   WITHOUT expansion: s_waitcnt vmcnt(0)
+;   WITH expansion:    s_waitcnt vmcnt(11)
+;                      s_waitcnt vmcnt(10)
+;                      ...
+;                      s_waitcnt vmcnt(1)
+;                      s_waitcnt vmcnt(0)
+
+define amdgpu_kernel void @case3_overlapping_counters(
+; EXPAND-LABEL: case3_overlapping_counters:
+; EXPAND:       ; %bb.0:
+; EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; EXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; EXPAND-NEXT:    v_mov_b32_e32 v1, 1
+; EXPAND-NEXT:    v_mov_b32_e32 v2, 2
+; EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1]
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:4
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:8
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:12
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:16
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:20
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:24
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:28
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:32
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:36
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:40
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:44
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    s_add_u32 s2, s2, s6
+; EXPAND-NEXT:    s_addc_u32 s3, s3, s7
+; EXPAND-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:48
+; EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; EXPAND-NEXT:    s_endpgm
+;
+; NOEXPAND-LABEL: case3_overlapping_counters:
+; NOEXPAND:       ; %bb.0:
+; NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; NOEXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; NOEXPAND-NEXT:    v_mov_b32_e32 v1, 1
+; NOEXPAND-NEXT:    v_mov_b32_e32 v2, 2
+; NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:4
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:8
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:12
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:16
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:20
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:24
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:28
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:32
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:36
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:40
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:44
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    s_add_u32 s2, s2, s6
+; NOEXPAND-NEXT:    s_addc_u32 s3, s3, s7
+; NOEXPAND-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:48
+; NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; NOEXPAND-NEXT:    s_endpgm
+    ptr addrspace(1) %buf,
+    ptr addrspace(1) %data,
+    i64 %offset) {
+  ; Issue 12 stores to buffer - each increments vmcnt
+  %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0
+  store volatile i32 1, ptr addrspace(1) %ptr0, align 4
+  %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1
+  store volatile i32 2, ptr addrspace(1) %ptr1, align 4
+  %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2
+  store volatile i32 1, ptr addrspace(1) %ptr2, align 4
+  %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3
+  store volatile i32 2, ptr addrspace(1) %ptr3, align 4
+  %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4
+  store volatile i32 1, ptr addrspace(1) %ptr4, align 4
+  %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5
+  store volatile i32 2, ptr addrspace(1) %ptr5, align 4
+  %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6
+  store volatile i32 1, ptr addrspace(1) %ptr6, align 4
+  %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7
+  store volatile i32 2, ptr addrspace(1) %ptr7, align 4
+  %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8
+  store volatile i32 1, ptr addrspace(1) %ptr8, align 4
+  %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9
+  store volatile i32 2, ptr addrspace(1) %ptr9, align 4
+  %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10
+  store volatile i32 1, ptr addrspace(1) %ptr10, align 4
+  %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11
+  store volatile i32 2, ptr addrspace(1) %ptr11, align 4
+
+  ; Load from potentially aliasing address - also increments vmcnt
+  %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset
+  %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4
+
+  ; Store the loaded value
+  %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12
+  store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4
+
+  ret void
+}

>From 532c866dcc8079489eb60fff37d2c72cc6310687 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 24 Nov 2025 23:19:22 +0530
Subject: [PATCH 2/3] address review: remove subtarget integration

---
 clang/include/clang/Driver/Options.td  | 5 +----
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 ----
 clang/test/Driver/amdgpu-features.c    | 6 ------
 llvm/lib/Target/AMDGPU/AMDGPU.td       | 4 ----
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  | 5 -----
 5 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c0ba716484b6a..11e81e032d5fc 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5497,10 +5497,7 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
   " mode (AMDGPU only)">;
 defm amdgpu_precise_memory_op
     : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
-                  " precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>;
-defm amdgpu_expand_waitcnt_profiling
-    : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable",
-                  " waitcnt expansion for profiling (AMDGPU only)", m_amdgpu_Features_Group>;
+                  " precise memory mode (AMDGPU only)">;
 
 def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index f4ddb48c9abc6..1a243fef9532d 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -700,10 +700,6 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
                    options::OPT_mno_amdgpu_precise_memory_op, false))
     Features.push_back("+precise-memory");
 
-  if (Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling,
-                   options::OPT_mno_amdgpu_expand_waitcnt_profiling, false))
-    Features.push_back("+expand-waitcnt-profiling");
-
   handleTargetFeaturesGroup(D, Triple, Args, Features,
                             options::OPT_m_amdgpu_Features_Group);
 }
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index 16b3f4121ab7a..864744db203e9 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -38,9 +38,3 @@
 
 // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
 // NO-PREC-MEM-NOT: {{".*precise-memory"}}
-
-// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s
-// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling"
-
-// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s
-// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 3f9166f48ea22..54d94b1f8682e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -223,10 +223,6 @@ def FeaturePreciseMemory
     : SubtargetFeature<"precise-memory", "EnablePreciseMemory",
                        "true", "Enable precise memory mode">;
 
-def FeatureExpandWaitcntProfiling
-    : SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling",
-                       "true", "Expand waitcnt instructions for profiling">;
-
 def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
   "SGPRInitBug",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f2b885a790f41..f377b8aaf1333 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -90,7 +90,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool EnableCuMode = false;
   bool TrapHandler = false;
   bool EnablePreciseMemory = false;
-  bool EnableExpandWaitcntProfiling = false;
 
   // Used as options.
   bool EnableLoadStoreOpt = false;
@@ -675,10 +674,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
 
-  bool isExpandWaitcntProfilingEnabled() const {
-    return EnableExpandWaitcntProfiling;
-  }
-
   bool hasFlatAddressSpace() const {
     return FlatAddressSpace;
   }

>From 5242cfc9b73f989f534c95db59a09b6c22448945 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 25 Nov 2025 00:07:26 +0530
Subject: [PATCH 3/3] address review

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 314 ++++++++----------
 .../AMDGPU/expand-waitcnt-profiling.ll        |  30 +-
 2 files changed, 149 insertions(+), 195 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 4a70479358bad..b81554caf9dd1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,6 +63,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
     cl::desc("Force all waitcnt load counters to wait until 0"),
     cl::init(false), cl::Hidden);
 
+static cl::opt<bool> ExpandWaitcntProfiling(
+    "amdgpu-expand-waitcnt-profiling",
+    cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false),
+    cl::Hidden);
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -297,6 +302,30 @@ class WaitcntGenerator {
   // optimization.
   bool isOptNone() const { return OptNone; }
 
+  // Maximum wait count for counter T (its bit mask for IV); 0 if X_CNT/unknown.
+  unsigned getWaitCountMax(InstCounterType T) const {
+    switch (T) {
+    case LOAD_CNT:
+      return AMDGPU::getLoadcntBitMask(IV);
+    case DS_CNT:
+      return AMDGPU::getDscntBitMask(IV);
+    case EXP_CNT:
+      return AMDGPU::getExpcntBitMask(IV);
+    case STORE_CNT:
+      return AMDGPU::getStorecntBitMask(IV);
+    case SAMPLE_CNT:
+      return AMDGPU::getSamplecntBitMask(IV);
+    case BVH_CNT:
+      return AMDGPU::getBvhcntBitMask(IV);
+    case KM_CNT:
+      return AMDGPU::getKmcntBitMask(IV);
+    case X_CNT:
+      return 0; // Treated as no limit. NOTE(review): Max - 1 then wraps to ~0u.
+    default:
+      return 0;
+    }
+  }
+
   // Edits an existing sequence of wait count instructions according
   // to an incoming Waitcnt value, which is itself updated to reflect
   // any new wait count instructions which may need to be generated by
@@ -318,9 +347,11 @@ class WaitcntGenerator {
 
   // Generates new wait count instructions according to the  value of
   // Wait, returning true if any new instructions were created.
+  // ScoreBrackets is optional; when non-null it drives the profiling expansion.
   virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                 MachineBasicBlock::instr_iterator It,
-                                AMDGPU::Waitcnt Wait) = 0;
+                                AMDGPU::Waitcnt Wait,
+                                WaitcntBrackets *ScoreBrackets = nullptr) = 0;
 
   // Returns an array of bit masks which can be used to map values in
   // WaitEventType to corresponding counter values in InstCounterType.
@@ -356,7 +387,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
 
   bool createNewWaitcnt(MachineBasicBlock &Block,
                         MachineBasicBlock::instr_iterator It,
-                        AMDGPU::Waitcnt Wait) override;
+                        AMDGPU::Waitcnt Wait,
+                        WaitcntBrackets *ScoreBrackets = nullptr) override;
 
   const unsigned *getWaitEventMask() const override {
     assert(ST);
@@ -393,7 +425,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
 
   bool createNewWaitcnt(MachineBasicBlock &Block,
                         MachineBasicBlock::instr_iterator It,
-                        AMDGPU::Waitcnt Wait) override;
+                        AMDGPU::Waitcnt Wait,
+                        WaitcntBrackets *ScoreBrackets = nullptr) override;
 
   const unsigned *getWaitEventMask() const override {
     assert(ST);
@@ -494,16 +527,6 @@ class SIInsertWaitcnts {
   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
   bool run(MachineFunction &MF);
 
-  // Methods for expanding waitcnt instructions for profiling
-  bool expandWaitcntsForProfiling(MachineFunction &MF);
-  bool expandSingleWaitcnt(MachineInstr &MI, MachineBasicBlock &MBB);
-  bool expandSingleCounterWait(MachineInstr &MI, MachineBasicBlock &MBB,
-                               InstCounterType CT);
-  bool expandCounterSequence(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator InsertPos,
-                             InstCounterType CT, unsigned CountValue,
-                             DebugLoc DL);
-
   void setForceEmitWaitcnt() {
 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
 // For debug builds, get the debug counter info and adjust if need be
@@ -1533,7 +1556,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 /// required counters in \p Wait
 bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
-    AMDGPU::Waitcnt Wait) {
+    AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
   assert(ST);
   assert(isNormalMode(MaxCounter));
 
@@ -1543,28 +1566,83 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
   // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
   // single instruction while VScnt has its own instruction.
   if (Wait.hasWaitExceptStoreCnt()) {
-    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-    [[maybe_unused]] auto SWaitInst =
-        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
-    Modified = true;
+    // If profiling expansion is enabled and we have score brackets,
+    // emit an expanded sequence
+    if (ExpandWaitcntProfiling && ScoreBrackets) {
+      // Emit expansion for each active counter
+      if (Wait.LoadCnt != ~0u) {
+        unsigned UB = ScoreBrackets->getScoreUB(LOAD_CNT);
+        unsigned LB = ScoreBrackets->getScoreLB(LOAD_CNT);
+        unsigned Outstanding = std::min(UB - LB, getWaitCountMax(LOAD_CNT) - 1);
+        for (unsigned i = Outstanding; i >= Wait.LoadCnt && i != ~0u; --i) {
+          AMDGPU::Waitcnt ExpandWait;
+          ExpandWait.LoadCnt = i;
+          unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+          Modified = true;
+        }
+      }
+      if (Wait.DsCnt != ~0u) {
+        unsigned UB = ScoreBrackets->getScoreUB(DS_CNT);
+        unsigned LB = ScoreBrackets->getScoreLB(DS_CNT);
+        unsigned Outstanding = std::min(UB - LB, getWaitCountMax(DS_CNT) - 1);
+        for (unsigned i = Outstanding; i >= Wait.DsCnt && i != ~0u; --i) {
+          AMDGPU::Waitcnt ExpandWait;
+          ExpandWait.DsCnt = i;
+          unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+          Modified = true;
+        }
+      }
+      if (Wait.ExpCnt != ~0u) {
+        unsigned UB = ScoreBrackets->getScoreUB(EXP_CNT);
+        unsigned LB = ScoreBrackets->getScoreLB(EXP_CNT);
+        unsigned Outstanding = std::min(UB - LB, getWaitCountMax(EXP_CNT) - 1);
+        for (unsigned i = Outstanding; i >= Wait.ExpCnt && i != ~0u; --i) {
+          AMDGPU::Waitcnt ExpandWait;
+          ExpandWait.ExpCnt = i;
+          unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait);
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+          Modified = true;
+        }
+      }
+    } else {
+      // Normal behavior: emit single combined waitcnt
+      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+      [[maybe_unused]] auto SWaitInst =
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+      Modified = true;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
+      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+                 dbgs() << "New Instr: " << *SWaitInst << '\n');
+    }
   }
 
   if (Wait.hasWaitStoreCnt()) {
     assert(ST->hasVscnt());
 
-    [[maybe_unused]] auto SWaitInst =
+    if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) {
+      unsigned UB = ScoreBrackets->getScoreUB(STORE_CNT);
+      unsigned LB = ScoreBrackets->getScoreLB(STORE_CNT);
+      unsigned Outstanding = std::min(UB - LB, getWaitCountMax(STORE_CNT) - 1);
+      for (unsigned i = Outstanding; i >= Wait.StoreCnt && i != ~0u; --i) {
         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
             .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
-            .addImm(Wait.StoreCnt);
-    Modified = true;
+            .addImm(i);
+        Modified = true;
+      }
+    } else {
+      [[maybe_unused]] auto SWaitInst =
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+              .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+              .addImm(Wait.StoreCnt);
+      Modified = true;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
+      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+                 dbgs() << "New Instr: " << *SWaitInst << '\n');
+    }
   }
 
   return Modified;
@@ -1787,13 +1865,36 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
 bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
-    AMDGPU::Waitcnt Wait) {
+    AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
   assert(ST);
   assert(!isNormalMode(MaxCounter));
 
   bool Modified = false;
   const DebugLoc &DL = Block.findDebugLoc(It);
 
+  // For GFX12+, we use separate wait instructions, which makes expansion
+  // simpler
+  if (ExpandWaitcntProfiling && ScoreBrackets) {
+    // Emit expanded sequence for each active counter
+    for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+      unsigned Count = getWait(Wait, CT);
+      if (Count == ~0u)
+        continue;
+
+      unsigned UB = ScoreBrackets->getScoreUB(CT);
+      unsigned LB = ScoreBrackets->getScoreLB(CT);
+      unsigned Outstanding = std::min(UB - LB, getWaitCountMax(CT) - 1);
+
+      for (unsigned i = Outstanding; i >= Count && i != ~0u; --i) {
+        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(i);
+        Modified = true;
+      }
+    }
+    return Modified;
+  }
+
+  // Normal behavior (no expansion)
   // Check for opportunities to use combined wait instructions.
   if (Wait.DsCnt != ~0u) {
     MachineInstr *SWaitInst = nullptr;
@@ -2185,7 +2286,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
       Wait.XCnt = ~0u;
   }
 
-  if (WCG->createNewWaitcnt(Block, It, Wait))
+  if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
     Modified = true;
 
   return Modified;
@@ -2735,156 +2836,6 @@ SIInsertWaitcntsPass::run(MachineFunction &MF,
       .preserve<AAManager>();
 }
 
-/// Expand waitcnt instructions for profiling by inserting a sequence of
-/// decreasing counter values. This helps identify which specific memory
-/// operation is a bottleneck during PC sampling.
-bool SIInsertWaitcnts::expandWaitcntsForProfiling(MachineFunction &MF) {
-  if (!ST->isExpandWaitcntProfilingEnabled())
-    return false;
-
-  bool Modified = false;
-
-  // Iterate through all basic blocks
-  for (MachineBasicBlock &MBB : MF) {
-    for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
-      MachineInstr &MI = *I;
-      ++I; // Advance iterator before potential expansion
-
-      if (ST->hasExtendedWaitCounts()) {
-        // GFX12+: Handle separate wait instructions
-        if (auto CT = counterTypeForInstr(MI.getOpcode())) {
-          Modified |= expandSingleCounterWait(MI, MBB, *CT);
-        }
-      } else {
-        // Pre-GFX12: Handle combined S_WAITCNT
-        if (MI.getOpcode() == AMDGPU::S_WAITCNT) {
-          Modified |= expandSingleWaitcnt(MI, MBB);
-        }
-      }
-    }
-  }
-
-  return Modified;
-}
-
-/// Expand a single S_WAITCNT instruction (pre-GFX12)
-bool SIInsertWaitcnts::expandSingleWaitcnt(MachineInstr &MI,
-                                           MachineBasicBlock &MBB) {
-  assert(MI.getOpcode() == AMDGPU::S_WAITCNT);
-
-  // Decode the waitcnt immediate
-  unsigned Imm = MI.getOperand(0).getImm();
-  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
-  AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt(IV, Imm);
-
-  // Insert expanded waitcnts BEFORE the original instruction
-  auto InsertPos = MI.getIterator();
-  DebugLoc DL = MI.getDebugLoc();
-
-  bool Modified = false;
-
-  // Expand each counter independently
-  // For independent counters (Case 2 from requirements):
-  // vmcnt and lgkmcnt can be separated
-  Modified |= expandCounterSequence(MBB, InsertPos, LOAD_CNT, Wait.LoadCnt, DL);
-  Modified |= expandCounterSequence(MBB, InsertPos, DS_CNT, Wait.DsCnt, DL);
-  Modified |= expandCounterSequence(MBB, InsertPos, EXP_CNT, Wait.ExpCnt, DL);
-  Modified |=
-      expandCounterSequence(MBB, InsertPos, STORE_CNT, Wait.StoreCnt, DL);
-
-  // If we expanded anything, remove the original waitcnt
-  if (Modified) {
-    MI.eraseFromParent();
-  }
-
-  return Modified;
-}
-
-/// Expand a single counter wait instruction (GFX12+)
-bool SIInsertWaitcnts::expandSingleCounterWait(MachineInstr &MI,
-                                               MachineBasicBlock &MBB,
-                                               InstCounterType CT) {
-  // Get the counter value from the instruction
-  unsigned CountValue = MI.getOperand(0).getImm();
-
-  // Insert expanded waitcnts BEFORE the original instruction
-  auto InsertPos = MI.getIterator();
-  DebugLoc DL = MI.getDebugLoc();
-
-  bool Modified = expandCounterSequence(MBB, InsertPos, CT, CountValue, DL);
-
-  // If we expanded, remove the original instruction
-  if (Modified) {
-    MI.eraseFromParent();
-  }
-
-  return Modified;
-}
-
-/// Insert a sequence of wait instructions with decreasing counter values
-bool SIInsertWaitcnts::expandCounterSequence(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos,
-    InstCounterType CT, unsigned CountValue, DebugLoc DL) {
-  // Skip if counter is already at zero, not active, or at max (wait not needed)
-  if (CountValue == 0 || CountValue == ~0u)
-    return false;
-
-  unsigned MaxCount = getWaitCountMax(CT);
-  if (CountValue >= MaxCount)
-    return false;
-
-  bool Modified = false;
-
-  // Generate decreasing sequence: CountValue-1, CountValue-2, ..., 1, 0
-  // We start from CountValue-1 because the original waitcnt already handles
-  // CountValue
-  for (int i = CountValue - 1; i >= 0; --i) {
-    if (ST->hasExtendedWaitCounts()) {
-      // GFX12+: Use separate wait instructions
-      unsigned Opcode = instrsForExtendedCounterTypes[CT];
-      BuildMI(MBB, InsertPos, DL, TII->get(Opcode)).addImm(i);
-    } else {
-      // Pre-GFX12: Use combined S_WAITCNT with only this counter set
-      AMDGPU::Waitcnt Wait;
-      switch (CT) {
-      case LOAD_CNT:
-        Wait.LoadCnt = i;
-        break;
-      case DS_CNT:
-        Wait.DsCnt = i;
-        break;
-      case EXP_CNT:
-        Wait.ExpCnt = i;
-        break;
-      case STORE_CNT:
-        Wait.StoreCnt = i;
-        break;
-      case SAMPLE_CNT:
-        Wait.SampleCnt = i;
-        break;
-      case BVH_CNT:
-        Wait.BvhCnt = i;
-        break;
-      case KM_CNT:
-        Wait.KmCnt = i;
-        break;
-      case X_CNT:
-        Wait.XCnt = i;
-        break;
-      default:
-        break;
-      }
-
-      AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
-      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-      BuildMI(MBB, InsertPos, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
-    }
-    Modified = true;
-  }
-
-  return Modified;
-}
-
 bool SIInsertWaitcnts::run(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -3123,10 +3074,5 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   PreheadersToFlush.clear();
   SLoadAddresses.clear();
 
-  // Expand waitcnts for profiling if requested
-  if (ST->isExpandWaitcntProfilingEnabled()) {
-    Modified |= expandWaitcntsForProfiling(MF);
-  }
-
   return Modified;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index cc99c457677ad..b5583cfe2dc3b 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -1,16 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s
 
-; NOTE: These simple test cases are optimized to generate waitcnt(0) by the
-; time values are needed. The expansion feature correctly does NOT expand waitcnt(0).
-
-; Pattern: Multiple scalar loads that increment lgkmcnt, followed by use
-; Expected on real kernels with non-zero lgkmcnt:
-;   WITHOUT expansion: s_waitcnt lgkmcnt(0)
-;   WITH expansion:    s_waitcnt lgkmcnt(2)
-;                      s_waitcnt lgkmcnt(1)
-;                      s_waitcnt lgkmcnt(0)
+; This test demonstrates the waitcnt expansion feature for PC-sampling profiling.
+; The expansion transforms a single waitcnt instruction into a sequence of waitcnts
+; with decreasing counter values to help identify which specific memory operation
+; is causing a bottleneck.
+;
+; NOTE: These simple test cases are optimized such that by the time a wait is needed,
+; all outstanding operations have already been issued and can be waited on with a
+; single waitcnt. In this case, there are no outstanding operations at the wait point
+; (upper bound = target value), so no expansion occurs. This is correct behavior.
+;
+; In real-world kernels with complex control flow, there will be outstanding operations
+; when waits are inserted. For example, if 5 memory operations are outstanding and we
+; need to wait until at most 2 remain outstanding, the expansion will generate:
+;   s_waitcnt lgkmcnt(4) ; wait for 1st op
+;   s_waitcnt lgkmcnt(3) ; wait for 2nd op
+;   s_waitcnt lgkmcnt(2) ; wait for 3rd op, target reached
+; This allows PC-sampling to identify which specific operation is slow.
 
 define amdgpu_kernel void @case1_single_counter_lgkmcnt(
 ; EXPAND-LABEL: case1_single_counter_lgkmcnt: