[llvm] [AMDGPU][SIMemoryLegalizer][GFX12] Correctly insert sample/bvhcnt (PR #161637)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 01:24:45 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
The check used was not strong enough to prevent the insertion of sample/bvhcnt when they were not needed.
I assume SIInsertWaitCnts was trimming those away anyway, but this was a bug
nonetheless.
---
Patch is 197.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161637.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+54-33)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (-104)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll (-104)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (-104)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (-52)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (-104)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll (-104)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (-104)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (-52)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 484861dcaac07..362ef140a28f8 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -360,11 +360,13 @@ class SICacheControl {
/// between memory instructions to enforce the order they become visible as
/// observed by other memory instructions executing in memory scope \p Scope.
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
- /// address spaces. Returns true iff any instructions inserted.
+ /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
+ /// that are used by atomic instructions.
+ /// Returns true iff any instructions inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const = 0;
+ AtomicOrdering Order, bool AtomicsOnly) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure any subsequent memory instructions of this
@@ -437,7 +439,7 @@ class SIGfx6CacheControl : public SICacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -484,7 +486,7 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -572,7 +574,7 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -629,7 +631,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -1120,7 +1122,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1140,7 +1143,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const {
+ AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -1294,7 +1298,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
}
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1447,7 +1452,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1467,8 +1473,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos,
- AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
if (ST.isTgSplitEnabled()) {
// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to wait for global or GDS memory operations
@@ -1488,7 +1494,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
AddrSpace &= ~SIAtomicAddrSpace::LDS;
}
return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
- IsCrossAddrSpaceOrdering, Pos, Order);
+ IsCrossAddrSpaceOrdering, Pos, Order,
+ AtomicsOnly);
}
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1747,7 +1754,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1904,7 +1912,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
// S_WAITCNT needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1984,7 +1993,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -2007,7 +2017,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -2444,7 +2457,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
+ if (!AtomicsOnly && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2587,7 +2600,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
// we of course need to wait for that as well.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -2624,7 +2638,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
}
return Changed;
@@ -2748,13 +2763,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE, Order);
+ Position::BEFORE, Order, /*AtomicsOnly=*/false);
if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
- MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
+ // The wait below only needs to wait on the prior atomic.
+ Changed |=
+ CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER, Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
@@ -2830,9 +2847,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
if (Order == AtomicOrdering::Acquire) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
+ // Acquire fences only need to wait on the previous atomic they pair with.
+ Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE, Order, /*AtomicsOnly=*/true);
}
if (Order == AtomicOrdering::Release ||
@@ -2897,10 +2916,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Order == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), MOI.getInstrAddrSpace(),
- isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
+ // Only wait on the previous atomic.
+ Changed |=
+ CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
+ Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 4caaad646378d..9e2906cf85432 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -795,8 +795,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
@@ -816,8 +814,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -2942,8 +2938,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
@@ -2964,8 +2958,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -3196,8 +3188,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
@@ -3218,8 +3208,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9001,8 +8989,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
@@ -9027,8 +9013,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9349,8 +9333,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
@@ -9375,8 +9357,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -9677,8 +9657,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
@@ -9699,8 +9677,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/161637
More information about the llvm-commits
mailing list