[llvm] [AMDGPU][gfx1250] Also add a wait on xcnt before volatile accesses (PR #168852)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 03:01:10 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
---
Patch is 23.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168852.diff
16 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+4-2)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+9-3)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/spillv16.ll (+14)
- (modified) llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir (+1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cb27f474d78f3..2eea58bd4b5e6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1872,8 +1872,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasClusters() const { return HasClusters; }
/// \returns true if the subtarget requires a wait for xcnt before atomic
- /// flat/global stores & rmw.
- bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+ /// stores and all volatile accesses for all isFLAT operations.
+ bool requiresWaitXCntBeforeAtomicStoreOrVolatileAccesses() const {
+ return GFX1250Insts;
+ }
/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bf04c7fa132c0..291dac83569e3 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2059,6 +2059,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+ if (ST.requiresWaitXCntBeforeAtomicStoreOrVolatileAccesses() &&
+ TII->isFLAT(*MI)) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2077,9 +2084,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
bool Changed = false;
- // GFX12.5 only: xcnt wait is needed before flat and global atomics
- // stores/rmw.
- if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ if (Atomic && ST.requiresWaitXCntBeforeAtomicStoreOrVolatileAccesses() &&
+ TII->isFLAT(MI)) {
MachineBasicBlock &MBB = *MI.getParent();
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 28d7e6916e519..453f98370efbc 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -5136,6 +5136,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
@@ -6215,6 +6216,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 5e2cec504c6a9..8d0131ccc6f7b 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -7409,8 +7409,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v1, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v0
@@ -7431,8 +7433,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v1, v0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v0
@@ -7686,6 +7690,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v0, v1
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1
@@ -7706,6 +7711,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index bdde7c0975425..bde7e7e38d8fa 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -111,6 +111,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index a2a8ce75d7fb4..2a866e8c625b4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -1303,6 +1303,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index c59b0ee83e955..7447ac68e7fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -150,6 +150,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -421,6 +422,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -582,6 +584,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -849,6 +852,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index ca7802d295e0b..36673c99ae056 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -92,6 +92,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 72cbbc0283545..8b04261ce37fb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -1104,6 +1104,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 2a40ee532be98..d32f961976325 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -153,6 +153,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -361,6 +362,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -532,6 +534,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
@@ -733,6 +736,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
index 80ea48be0b893..7ed4491d0df66 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
@@ -89,6 +89,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 6c19722ad6e33..e5dba27b0acc9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -1080,6 +1080,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 7c23b76cec3e9..811a10cbe3ba5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -162,6 +162,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -357,6 +358,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -521,6 +523,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
@@ -706,6 +709,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX1250-NEXT: v_and_b32_e64 v1, v0, s2
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b32 v1, v0, s0 scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index c1764c94ea2de..4a0ce202e124a 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -352,6 +352,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX1250-NEXT: v_mov_b32_e32 v2, s5
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
@@ -410,6 +411,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX1250-NEXT: v_mov_b32_e32 v2, s5
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 9686c9d30b97c..c16793675f6a2 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -71,6 +71,7 @@ define void @spill_i16_alu() {
; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -87,6 +88,7 @@ define void @spill_i16_alu() {
; GFX1250-FAKE16-NEXT: ;;#ASMSTART
; GFX1250-FAKE16-NEXT: ;;#ASMEND
; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
@@ -215,8 +217,10 @@ define void @spill_i16_alu_two_vals() {
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -236,9 +240,11 @@ define void @spill_i16_alu_two_vals() {
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v1, s32 scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -325,6 +331,7 @@ define void @spill_i16() {
; GFX1250-NEXT: ;;#ASMSTART
; GFX1250-NEXT: ;;#ASMEND
; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -403,6 +410,7 @@ define void @spill_half() {
; GFX1250-NEXT: ;;#ASMSTART
; GFX1250-NEXT: ;;#ASMEND
; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -481,6 +489,7 @@ define void @spill_i16_from_v2i16() {
; GFX1250-NEXT: ;;#ASMSTART
; GFX1250-NEXT: ;;#ASMEND
; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byt...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/168852
More information about the llvm-commits
mailing list