[llvm] [AMDGPU][SIInsertWaitCnts] Gfx12.5 - Refactor xcnt optimization (PR #164357)
Ryan Mitchell via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 9 23:21:03 PST 2025
https://github.com/RyanRio updated https://github.com/llvm/llvm-project/pull/164357
>From 4df30af0999a6af9036fdc28d38862eb17528e8e Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Thu, 30 Oct 2025 11:39:00 -0700
Subject: [PATCH 01/10] [AMDGPU][SIInsertWaitCnts] Refactor xcnt optimization
Refactor the XCnt optimization checks into separate predicates so that they
can also be evaluated when applying a pre-existing waitcnt. This has the
effect of removing unnecessary xcnt waits when taking a loop backedge.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 53 ++++++++--------
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fmin3.ll | 2 -
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 60 +++++++------------
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 1 -
5 files changed, 51 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..b6e2c3a7f8950 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -646,6 +646,8 @@ class WaitcntBrackets {
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
+ bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
+ bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);
@@ -1287,40 +1289,35 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
- // On entry to a block with multiple predescessors, there may
- // be pending SMEM and VMEM events active at the same time.
- // In such cases, only clear one active event at a time.
- auto applyPendingXcntGroup = [this](unsigned E) {
- unsigned LowerBound = getScoreLB(X_CNT);
- applyWaitcnt(X_CNT, 0);
- PendingEvents |= (1 << E);
- setScoreLB(X_CNT, LowerBound);
- };
-
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
- if (hasPendingEvent(VMEM_GROUP))
- applyPendingXcntGroup(VMEM_GROUP);
- else
- applyWaitcnt(X_CNT, 0);
- return;
- }
+ return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
+}
+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads retun in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
- if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT)) {
- if (hasPendingEvent(SMEM_GROUP))
- applyPendingXcntGroup(SMEM_GROUP);
+ return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
+}
+
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ if (hasRedundantXCntWithKmCnt(Wait)) {
+ if (hasPendingEvent(VMEM_GROUP))
+ // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require handling.
+ PendingEvents &= ~(1 << SMEM_GROUP);
else
- applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ applyWaitcnt(X_CNT, 0);
return;
}
-
+ if (canOptimizeXCntWithLoadCnt(Wait))
+ // On entry to a block with multiple predescessors, there may
+ // be pending SMEM and VMEM events active at the same time.
+ // In such cases, only clear one active event at a time.
+ return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
applyWaitcnt(X_CNT, Wait.XCnt);
}
@@ -1656,6 +1653,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}
+ // Save the pre combine waitcnt in order to make xcnt checks.
+ AMDGPU::Waitcnt PreCombine = Wait;
if (CombinedLoadDsCntInstr) {
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
// to be waited for. Otherwise, let the instruction be deleted so
@@ -1746,6 +1745,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
+ (CT == LOAD_CNT &&
+ ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine)))
+ // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
+ // due to taking the backedge of a block.
+ ScoreBrackets.applyXcnt(PreCombine);
if (!WaitInstrs[CT])
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 31344c78990b8..f5feeb2f49171 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2107,7 +2107,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2126,7 +2126,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
@@ -2162,7 +2161,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2183,7 +2182,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 6a6f232c55e24..2756472652bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: s_mov_b32 s4, s14
; GFX1250-NEXT: s_mov_b32 s5, s15
; GFX1250-NEXT: s_mov_b32 s0, s8
@@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: s_mov_b32 s4, s14
; GFX1250-NEXT: s_mov_b32 s5, s15
; GFX1250-NEXT: s_mov_b32 s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index a50791e10f5a2..ed565ca43f9a3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -8814,7 +8814,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8857,7 +8857,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9322,7 +9322,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9365,7 +9365,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9844,7 +9844,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9888,7 +9888,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10365,7 +10365,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -10407,7 +10406,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -10857,7 +10855,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -10899,7 +10896,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11363,7 +11359,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11406,7 +11401,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11861,7 +11855,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11893,7 +11887,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12245,7 +12239,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12276,7 +12269,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12631,7 +12623,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12674,7 +12666,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13154,7 +13146,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13196,7 +13187,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13676,7 +13666,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13722,7 +13712,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14273,7 +14263,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14319,7 +14309,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14888,7 +14878,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14936,7 +14926,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15502,7 +15492,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15547,7 +15536,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16081,7 +16069,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16126,7 +16113,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16678,7 +16664,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16725,7 +16710,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -17269,7 +17253,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17305,7 +17289,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17753,7 +17737,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -17788,7 +17771,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -18238,7 +18220,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB62_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18284,7 +18266,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB62_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18854,7 +18836,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -18899,7 +18880,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 355d0026091d9..7aecae901becf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -101,7 +101,6 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
; GFX1250-NEXT: s_endpgm
>From 0ac18c100fb316b44d65be3ae3e4ca99e9b3523b Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Mon, 3 Nov 2025 13:33:04 -0800
Subject: [PATCH 02/10] [AMDGPU][SIInsertWaitCnts] Add mixed pending event xcnt
test
---
llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 45 ++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index f964480dcc633..f6b547594ccc0 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -1114,6 +1114,51 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
+---
+name: mixed_pending_events
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: mixed_pending_events
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_LOADCNT 1
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr2, $vgpr2
+ $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+...
+
---
name: pending_vmem_event_between_block
tracksRegLiveness: true
>From d87068d8cf93f0ffc888dcdca795508524b1a20f Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Mon, 3 Nov 2025 13:52:44 -0800
Subject: [PATCH 03/10] [AMDGPU][SIInsertWaitCnts] format
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b6e2c3a7f8950..1da7e8ceb6c6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1307,7 +1307,8 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
if (hasRedundantXCntWithKmCnt(Wait)) {
if (hasPendingEvent(VMEM_GROUP))
- // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require handling.
+ // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
+ // handling.
PendingEvents &= ~(1 << SMEM_GROUP);
else
applyWaitcnt(X_CNT, 0);
>From cca7a3ebca7ef6d2f4441447d7ddffb2e7d274e9 Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Mon, 3 Nov 2025 14:09:22 -0800
Subject: [PATCH 04/10] [AMDGPU][SIInsertWaitCnts] format braces
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1da7e8ceb6c6f..17ebf5ca41d04 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1306,19 +1306,21 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
if (hasRedundantXCntWithKmCnt(Wait)) {
- if (hasPendingEvent(VMEM_GROUP))
+ if (hasPendingEvent(VMEM_GROUP)) {
// Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
// handling.
PendingEvents &= ~(1 << SMEM_GROUP);
- else
+ } else {
applyWaitcnt(X_CNT, 0);
+ }
return;
}
- if (canOptimizeXCntWithLoadCnt(Wait))
+ if (canOptimizeXCntWithLoadCnt(Wait)) {
// On entry to a block with multiple predescessors, there may
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ }
applyWaitcnt(X_CNT, Wait.XCnt);
}
@@ -1748,10 +1750,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
(CT == LOAD_CNT &&
- ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine)))
+ ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
// due to taking the backedge of a block.
ScoreBrackets.applyXcnt(PreCombine);
+ }
if (!WaitInstrs[CT])
continue;
>From 44466c208d7e3d3e08e0d243c8dbf00ae6b30f0d Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Tue, 4 Nov 2025 11:17:55 -0800
Subject: [PATCH 05/10] [AMDGPU][SIInsertWaitCnts] use simplifyWaitcnt code
path
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 51 +++++++++------------
1 file changed, 21 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 17ebf5ca41d04..55dcc911e1507 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -633,8 +633,11 @@ class WaitcntBrackets {
const MachineOperand &Op) const;
bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
+ bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
+ void simplifyXcnt(AMDGPU::Waitcnt &Wait);
void determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const;
@@ -646,9 +649,6 @@ class WaitcntBrackets {
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
- bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
- bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);
unsigned hasPendingEvent() const { return PendingEvents; }
@@ -1194,7 +1194,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
@@ -1202,7 +1202,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
- simplifyWaitcnt(X_CNT, Wait.XCnt);
+ simplifyXcnt(Wait);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1272,7 +1272,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
- applyXcnt(Wait);
+ applyWaitcnt(X_CNT, Wait.XCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1304,7 +1304,11 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
!hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
}
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &Wait) {
+ // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
+ // optimizations. On entry to a block with multiple predescessors, there may
+ // be pending SMEM and VMEM events active at the same time.
+ // In such cases, only clear one active event at a time.
if (hasRedundantXCntWithKmCnt(Wait)) {
if (hasPendingEvent(VMEM_GROUP)) {
// Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
@@ -1313,15 +1317,10 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
} else {
applyWaitcnt(X_CNT, 0);
}
- return;
- }
- if (canOptimizeXCntWithLoadCnt(Wait)) {
- // On entry to a block with multiple predescessors, there may
- // be pending SMEM and VMEM events active at the same time.
- // In such cases, only clear one active event at a time.
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ } else if (canOptimizeXCntWithLoadCnt(Wait)) {
+ applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
}
- applyWaitcnt(X_CNT, Wait.XCnt);
+ simplifyWaitcnt(X_CNT, Wait.XCnt);
}
// Where there are multiple types of event in the bracket of a counter,
@@ -1753,7 +1752,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
// due to taking the backedge of a block.
- ScoreBrackets.applyXcnt(PreCombine);
+ ScoreBrackets.simplifyXcnt(PreCombine);
}
if (!WaitInstrs[CT])
continue;
@@ -2169,19 +2168,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- // XCnt may be already consumed by a load wait.
- if (Wait.XCnt != ~0u) {
- if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
- Wait.XCnt = ~0u;
-
- if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
- Wait.XCnt = ~0u;
-
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (isVmemAccess(*It))
- Wait.XCnt = ~0u;
+ // Since the translation for VMEM addresses occur in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(*It)) {
+ Wait.XCnt = ~0u;
}
if (WCG->createNewWaitcnt(Block, It, Wait))
>From 4770a1eb28e59d4f6faa2f291ba4bd890825bc58 Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Tue, 4 Nov 2025 12:03:16 -0800
Subject: [PATCH 06/10] [AMDGPU][SIInsertWaitCnts] move vmemaccess check
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 55dcc911e1507..ba4c3b8ef3cec 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2097,6 +2097,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.DsCnt = 0;
}
+ // Since the translation for VMEM addresses occur in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (isVmemAccess(MI)) {
+ Wait.XCnt = ~0u;
+ }
+
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
@@ -2168,13 +2175,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (Wait.XCnt != ~0u && isVmemAccess(*It)) {
- Wait.XCnt = ~0u;
- }
-
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
>From 64fb73ed90bf44c99daf8f792bf85b690db034b7 Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Wed, 5 Nov 2025 15:35:47 -0800
Subject: [PATCH 07/10] Revert "[AMDGPU][SIInsertWaitCnts] move vmemaccess
check"
This reverts commit d3ed9c9be4142a9cecb6629e5affac8c4468d03f.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ba4c3b8ef3cec..55dcc911e1507 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2097,13 +2097,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.DsCnt = 0;
}
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (isVmemAccess(MI)) {
- Wait.XCnt = ~0u;
- }
-
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
@@ -2175,6 +2168,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
+ // Since the translation for VMEM addresses occur in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(*It)) {
+ Wait.XCnt = ~0u;
+ }
+
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
>From 2683aa720c1b69c6254cd3c13d84a22f1f8584dd Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Wed, 5 Nov 2025 16:22:45 -0800
Subject: [PATCH 08/10] [AMDGPU][SIInsertWaitCnts] move vmem access check and
apply waitcnt
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 31 +++++++++++----------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 55dcc911e1507..b7140982065f0 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -637,7 +637,7 @@ class WaitcntBrackets {
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
- void simplifyXcnt(AMDGPU::Waitcnt &Wait);
+ void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
void determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const;
@@ -1202,7 +1202,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
- simplifyXcnt(Wait);
+ simplifyXcnt(Wait, Wait);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1304,12 +1304,12 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
!hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
}
-void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &Wait) {
+void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait) {
// Try to simplify xcnt further by checking for joint kmcnt and loadcnt
// optimizations. On entry to a block with multiple predescessors, there may
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
- if (hasRedundantXCntWithKmCnt(Wait)) {
+ if (hasRedundantXCntWithKmCnt(CheckWait)) {
if (hasPendingEvent(VMEM_GROUP)) {
// Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
// handling.
@@ -1317,10 +1317,10 @@ void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &Wait) {
} else {
applyWaitcnt(X_CNT, 0);
}
- } else if (canOptimizeXCntWithLoadCnt(Wait)) {
- applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
+ applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
}
- simplifyWaitcnt(X_CNT, Wait.XCnt);
+ simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
}
// Where there are multiple types of event in the bracket of a counter,
@@ -1752,7 +1752,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
// due to taking the backedge of a block.
- ScoreBrackets.simplifyXcnt(PreCombine);
+ ScoreBrackets.simplifyXcnt(PreCombine, Wait);
}
if (!WaitInstrs[CT])
continue;
@@ -2100,6 +2100,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
+ // Since the translation for VMEM addresses occurs in-order, we can apply the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
+ ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
+ Wait.XCnt = ~0u;
+ }
+
// When forcing emit, we need to skip terminators because that would break the
// terminators of the MBB if we emit a waitcnt between terminators.
if (ForceEmitZeroFlag && !MI.isTerminator())
@@ -2168,13 +2176,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory
- // dependency with another VMEM instruction in flight.
- if (Wait.XCnt != ~0u && isVmemAccess(*It)) {
- Wait.XCnt = ~0u;
- }
-
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
>From a1ee5b8773b6380b7354e036137d35b89139fafb Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Wed, 5 Nov 2025 16:25:18 -0800
Subject: [PATCH 09/10] [AMDGPU][SIInsertWaitCnts] format
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7140982065f0..c4a605618cdbf 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1304,7 +1304,8 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
!hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
}
-void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait) {
+void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
+ AMDGPU::Waitcnt &UpdateWait) {
// Try to simplify xcnt further by checking for joint kmcnt and loadcnt
// optimizations. On entry to a block with multiple predescessors, there may
// be pending SMEM and VMEM events active at the same time.
>From 7708bb6dae18f7a15eb29e658c456d96d4b5093c Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Sun, 9 Nov 2025 23:20:42 -0800
Subject: [PATCH 10/10] [AMDGPU][SIInsertWaitCnts] test changes, use jayfoad
patch
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 16 ++++---
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 9 ----
llvm/test/CodeGen/AMDGPU/bf16.ll | 14 ++++++
.../AMDGPU/branch-relaxation-gfx1250.ll | 1 -
.../AMDGPU/flat-load-saddr-to-vaddr.ll | 1 -
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 2 -
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 9 ----
.../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 1 -
.../AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll | 2 +
llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 44 -------------------
10 files changed, 25 insertions(+), 74 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c4a605618cdbf..7d0f904ab89f6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1301,7 +1301,7 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
// stores. VMEM loads retun in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
+ !hasPendingEvent(STORE_CNT);
}
void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
@@ -1311,15 +1311,17 @@ void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
if (hasRedundantXCntWithKmCnt(CheckWait)) {
- if (hasPendingEvent(VMEM_GROUP)) {
- // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
- // handling.
- PendingEvents &= ~(1 << SMEM_GROUP);
- } else {
+ if (!hasMixedPendingEvents(X_CNT)) {
applyWaitcnt(X_CNT, 0);
+ } else {
+ PendingEvents &= ~(1 << SMEM_GROUP);
}
} else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
- applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
+ if (!hasMixedPendingEvents(X_CNT)) {
+ applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
+ } else if (CheckWait.LoadCnt == 0) {
+ PendingEvents &= ~(1 << VMEM_GROUP);
+ }
}
simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 7e297f46a780e..b5d593a9c15ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1501,7 +1501,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1574,7 +1573,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1649,7 +1647,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1722,7 +1719,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1913,7 +1909,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1959,7 +1954,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2002,7 +1996,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2047,7 +2040,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2210,7 +2202,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index f3885d6dadf9b..df1d6b8751271 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2520,6 +2520,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2783,6 +2784,7 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2872,6 +2874,7 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
@@ -6850,6 +6853,7 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -6943,6 +6947,7 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7033,6 +7038,7 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7134,6 +7140,7 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7251,6 +7258,7 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
@@ -7367,6 +7375,7 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
@@ -8001,6 +8010,7 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -8241,6 +8251,7 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
@@ -8377,6 +8388,7 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
@@ -8522,6 +8534,7 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5
@@ -8693,6 +8706,7 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index f465e3c505c02..31307b245bafe 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -152,7 +152,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
index e8efa859ce13f..213233e802a96 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
@@ -27,7 +27,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 4, v[0:1]
; GCN-NEXT: v_add_co_u32 v2, s0, v2, 1
; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
index ba761bedb905c..9e5a4428b011f 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
@@ -38,7 +38,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -80,7 +79,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 6484c2f82ff94..831af7b6c10ba 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1473,7 +1473,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1516,7 +1515,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1561,7 +1559,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1604,7 +1601,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1776,7 +1772,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1821,7 +1816,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1864,7 +1858,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1909,7 +1902,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2083,7 +2075,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 3324018ca7237..5b36d4cefa2e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -66,7 +66,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
index d9f2fc55709a6..f7ed5341141d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
@@ -330,6 +330,7 @@ define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr a
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -348,6 +349,7 @@ define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr a
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index f6b547594ccc0..4f2b375acd319 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -1070,50 +1070,6 @@ body: |
...
# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
----
-name: mixed_pending_events
-tracksRegLiveness: true
-machineFunctionInfo:
- isEntryFunction: true
-body: |
- ; GCN-LABEL: name: mixed_pending_events
- ; GCN: bb.0:
- ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
- ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.2:
- ; GCN-NEXT: liveins: $sgpr2, $vgpr2
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: S_WAIT_LOADCNT 1
- ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
- ; GCN-NEXT: S_WAIT_KMCNT 0
- ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
- ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- bb.0:
- liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
- $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
- S_CBRANCH_SCC1 %bb.2, implicit $scc
- bb.1:
- liveins: $vgpr0_vgpr1, $sgpr2
- $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- bb.2:
- liveins: $sgpr2, $vgpr2
- $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
- $sgpr2 = S_MOV_B32 $sgpr2
- $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-...
-
---
name: mixed_pending_events
tracksRegLiveness: true
More information about the llvm-commits
mailing list