[llvm] [AMDGPU][SIInsertWaitCnts] Gfx12.5 - Refactor xcnt optimization (PR #164357)
Ryan Mitchell via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 20 21:56:55 PDT 2025
https://github.com/RyanRio created https://github.com/llvm/llvm-project/pull/164357
Refactor the XCnt optimization checks so that they can be checked when applying a pre-existing waitcnt. This removes unnecessary xcnt waits when taking a loop backedge.
From 8ad547954ca0488697d673150f506c267e07395d Mon Sep 17 00:00:00 2001
From: Ryan Mitchell <Ryan.Mitchell at amd.com>
Date: Mon, 20 Oct 2025 21:35:18 -0700
Subject: [PATCH] [AMDGPU][SIInsertWaitCnts] Refactor xcnt optimization
Refactor the XCnt optimization checks so that they can be checked when
applying a pre-existing waitcnt. This has the effect of removing
unnecessary xcnt waits when taking a loop backedge.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 25 ++++++--
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 7 +--
llvm/test/CodeGen/AMDGPU/fmin3.ll | 2 -
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 60 +++++++------------
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 1 -
5 files changed, 42 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced010a5a..1aeb25248436c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -646,6 +646,8 @@ class WaitcntBrackets {
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
+ bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
+ bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);
@@ -1287,20 +1289,25 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
+ return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
+}
+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads return in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
- if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && !hasPendingEvent(STORE_CNT);
+}
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ if (hasRedundantXCntWithKmCnt(Wait))
+ return applyWaitcnt(X_CNT, 0);
+ if (canOptimizeXCntWithLoadCnt(Wait))
+ return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
applyWaitcnt(X_CNT, Wait.XCnt);
}
@@ -1729,6 +1736,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
if (!WaitInstrs[CT])
continue;
+ if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(Wait)) ||
+ (CT == LOAD_CNT && ScoreBrackets.canOptimizeXCntWithLoadCnt(Wait)))
+ // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
+ // due to taking the backedge of a block.
+ ScoreBrackets.applyXcnt(Wait);
+
unsigned NewCnt = getWait(Wait, CT);
if (NewCnt != ~0u) {
Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index b5b2655246c3f..fd1e8807885a5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2115,7 +2115,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2134,7 +2134,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
@@ -2170,7 +2169,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2191,7 +2190,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 6a6f232c55e24..2756472652bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: s_mov_b32 s4, s14
; GFX1250-NEXT: s_mov_b32 s5, s15
; GFX1250-NEXT: s_mov_b32 s0, s8
@@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: s_mov_b32 s4, s14
; GFX1250-NEXT: s_mov_b32 s5, s15
; GFX1250-NEXT: s_mov_b32 s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index a50791e10f5a2..ed565ca43f9a3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -8814,7 +8814,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8857,7 +8857,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9322,7 +9322,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9365,7 +9365,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9844,7 +9844,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9888,7 +9888,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10365,7 +10365,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -10407,7 +10406,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -10857,7 +10855,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -10899,7 +10896,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11363,7 +11359,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11406,7 +11401,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11861,7 +11855,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11893,7 +11887,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12245,7 +12239,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12276,7 +12269,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12631,7 +12623,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12674,7 +12666,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13154,7 +13146,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13196,7 +13187,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13676,7 +13666,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13722,7 +13712,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14273,7 +14263,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14319,7 +14309,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14888,7 +14878,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14936,7 +14926,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15502,7 +15492,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15547,7 +15536,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16081,7 +16069,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16126,7 +16113,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16678,7 +16664,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -16725,7 +16710,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -17269,7 +17253,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17305,7 +17289,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17753,7 +17737,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -17788,7 +17771,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -18238,7 +18220,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB62_1
; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18284,7 +18266,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB62_1
; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18854,7 +18836,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -18899,7 +18880,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 355d0026091d9..7aecae901becf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -101,7 +101,6 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
; GFX1250-NEXT: s_endpgm
More information about the llvm-commits
mailing list