[llvm-branch-commits] [llvm] [AMDGPU] Implement "non-av" semantics using metadata (PR #199489)
Sameer Sahasrabuddhe via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed May 27 02:38:35 PDT 2026
https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/199489
>From 6aa8e1cf9581f3d764edc7b3614dd0231ebaf506 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Mon, 25 May 2026 10:39:17 +0530
Subject: [PATCH 1/3] [AMDGPU] Implement "non-av" semantics using metadata
When the MMRA tag !{!"amdgcn-av", !"none"} is present on a synchronization
operation (fence, atomic load/store/rmw/cmpxchg), suppress cache writeback
(MakeAvailable) and cache invalidation (MakeVisible) while preserving
memory ordering (waits).
This implements the metadata proposed in #191246.
Fixes: LCOMPILER-2214
Assisted-By: Claude Opus 4.6
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 76 +-
.../AMDGPU/memory-legalizer-av-none.ll | 722 ++++++++++++++++++
2 files changed, 769 insertions(+), 29 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index b721dcaf49d0f..f16192d343531 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -148,6 +148,7 @@ class SIMemOpInfo final {
bool IsNonTemporal = false;
bool IsLastUse = false;
bool IsCooperative = false;
+ bool IsAVNone = false;
// TODO: Should we assume Cooperative=true if no MMO is present?
SIMemOpInfo(
@@ -160,12 +161,12 @@ class SIMemOpInfo final {
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
bool IsVolatile = false, bool IsNonTemporal = false,
bool IsLastUse = false, bool IsCooperative = false,
- bool CanDemoteWorkgroupToWavefront = false)
+ bool CanDemoteWorkgroupToWavefront = false, bool IsAVNone = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
- IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
+ IsLastUse(IsLastUse), IsCooperative(IsCooperative), IsAVNone(IsAVNone) {
if (Ordering == AtomicOrdering::NotAtomic) {
assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
@@ -277,6 +278,9 @@ class SIMemOpInfo final {
/// \returns True if this is a cooperative load or store atomic.
bool isCooperative() const { return IsCooperative; }
+ /// \returns True if MakeAvailable/MakeVisible should be suppressed.
+ bool isAVNone() const { return IsAVNone; }
+
/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
bool isAtomic() const {
@@ -451,13 +455,13 @@ class SICacheControl {
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace,
Position Pos) const = 0;
- /// Inserts writeback followed by an unconditional wait to implement a
- /// release operation.
+ /// Inserts writeback (unless \p IsAVNone) followed by an unconditional wait.
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
+ Position Pos, bool IsAVNone) const {
bool Changed = false;
- Changed |= insertWriteback(MI, Scope, AddrSpace, Pos);
+ if (!IsAVNone)
+ Changed |= insertWriteback(MI, Scope, AddrSpace, Pos);
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos,
AtomicOrdering::Release, /*AtomicsOnly=*/false);
@@ -733,6 +737,13 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
return Result;
}
+/// \returns True if the MMRA metadata attached to \p MI contains the tag with
+/// prefix "amdgcn-av" and suffix "none"; false if it does not, or if the
+/// MMRAMetadata object built from it tests false (presumably no tags; confirm).
+static bool hasAVNoneMMRA(const MachineInstr &MI) {
+ auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
+ if (!MMRA)
+ return false;
+ return MMRA.hasTag("amdgcn-av", "none");
+}
+
} // end anonymous namespace
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -876,7 +887,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
IsNonTemporal, IsLastUse, IsCooperative,
- CanDemoteWorkgroupToWavefront);
+ CanDemoteWorkgroupToWavefront, hasAVNoneMMRA(*MI));
}
std::optional<SIMemOpInfo>
@@ -946,7 +957,7 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
AtomicOrdering::NotAtomic, false, false, false, false,
- CanDemoteWorkgroupToWavefront);
+ CanDemoteWorkgroupToWavefront, hasAVNoneMMRA(*MI));
}
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2317,9 +2328,10 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER, Order, /*AtomicsOnly=*/true);
- Changed |= CC->insertAcquire(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ if (!MOI.isAVNone()) {
+ Changed |= CC->insertAcquire(
+ MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER);
+ }
}
return Changed;
@@ -2363,11 +2375,12 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
Changed |= CC->handleCooperativeAtomic(*MI);
if (MOI.getOrdering() == AtomicOrdering::Release ||
- MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertRelease(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |=
+ CC->insertRelease(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE, MOI.isAVNone());
+ }
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
@@ -2412,7 +2425,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
if (Order == AtomicOrdering::Release ||
Order == AtomicOrdering::AcquireRelease ||
- Order == AtomicOrdering::SequentiallyConsistent)
+ Order == AtomicOrdering::SequentiallyConsistent) {
/// TODO: This relies on a barrier always generating a waitcnt
/// for LDS to ensure it is not reordered with the completion of
/// the proceeding LDS operations. If barrier had a memory
@@ -2422,18 +2435,21 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
/// adding S_WAITCNT before a S_BARRIER.
Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Position::BEFORE, MOI.isAVNone());
+ }
// TODO: If both release and invalidate are happening they could be combined
// to use the single "BUFFER_WBINV*" instruction. This could be done by
// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
// track cache invalidate and write back instructions.
- if (Order == AtomicOrdering::Acquire ||
- Order == AtomicOrdering::AcquireRelease ||
- Order == AtomicOrdering::SequentiallyConsistent)
+ if ((Order == AtomicOrdering::Acquire ||
+ Order == AtomicOrdering::AcquireRelease ||
+ Order == AtomicOrdering::SequentiallyConsistent) &&
+ !MOI.isAVNone()) {
Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
Position::BEFORE);
+ }
return Changed;
}
@@ -2469,11 +2485,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
if (Order == AtomicOrdering::Release ||
Order == AtomicOrdering::AcquireRelease ||
Order == AtomicOrdering::SequentiallyConsistent ||
- MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertRelease(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |=
+ CC->insertRelease(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE, MOI.isAVNone());
+ }
if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::AcquireRelease ||
@@ -2486,9 +2503,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
Order, /*AtomicsOnly=*/true);
- Changed |= CC->insertAcquire(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ if (!MOI.isAVNone()) {
+ Changed |= CC->insertAcquire(
+ MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER);
+ }
}
Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll
new file mode 100644
index 0000000000000..89230fe1b7cdd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+
+; Test that the MMRA tag !{!"amdgcn-av", !"none"} suppresses MakeAvailable/
+; MakeVisible (cache writeback/invalidation) while preserving ordering (waits).
+
+; Fences: one per scope, varying orderings.
+
+define amdgpu_kernel void @workgroup_acq_rel_fence_av_none() {
+; GFX90A-LABEL: workgroup_acq_rel_fence_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_acq_rel_fence_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_acq_rel_fence_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") acq_rel, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @cluster_seq_cst_fence_av_none() {
+; GFX90A-LABEL: cluster_seq_cst_fence_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_seq_cst_fence_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_seq_cst_fence_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: cluster_seq_cst_fence_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_endpgm
+entry:
+ fence syncscope("cluster") seq_cst, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_acquire_fence_av_none() {
+; GFX90A-LABEL: agent_acquire_fence_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_acquire_fence_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: agent_acquire_fence_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: agent_acquire_fence_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") acquire, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_release_fence_av_none() {
+; GFX90A-LABEL: agent_release_fence_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: agent_release_fence_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: agent_release_fence_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: agent_release_fence_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") release, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_seq_cst_fence_av_none() {
+; GFX90A-LABEL: system_seq_cst_fence_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: system_seq_cst_fence_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: system_seq_cst_fence_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_endpgm
+entry:
+ fence seq_cst, !mmra !0
+ ret void
+}
+
+; Atomic loads: acquire across scopes.
+
+define i32 @workgroup_acquire_load_av_none(ptr addrspace(1) %ptr) {
+; GFX90A-LABEL: workgroup_acquire_load_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_load_dword v0, v[0:1], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_acquire_load_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: workgroup_acquire_load_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-WGP-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: workgroup_acquire_load_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %val = load atomic i32, ptr addrspace(1) %ptr syncscope("workgroup") acquire, align 4, !mmra !0
+ ret i32 %val
+}
+
+define i32 @agent_acquire_load_av_none(ptr addrspace(1) %ptr) {
+; GFX90A-LABEL: agent_acquire_load_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: agent_acquire_load_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: agent_acquire_load_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-WGP-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: agent_acquire_load_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %val = load atomic i32, ptr addrspace(1) %ptr syncscope("agent") acquire, align 4, !mmra !0
+ ret i32 %val
+}
+
+define i32 @system_acquire_load_av_none(ptr addrspace(1) %ptr) {
+; GFX90A-LABEL: system_acquire_load_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: system_acquire_load_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: system_acquire_load_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-WGP-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: system_acquire_load_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %val = load atomic i32, ptr addrspace(1) %ptr acquire, align 4, !mmra !0
+ ret i32 %val
+}
+
+; Atomic stores: release across scopes.
+
+define void @workgroup_release_store_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: workgroup_release_store_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: workgroup_release_store_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: workgroup_release_store_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: workgroup_release_store_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ store atomic i32 %val, ptr addrspace(1) %ptr syncscope("workgroup") release, align 4, !mmra !0
+ ret void
+}
+
+define void @agent_release_store_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: agent_release_store_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: agent_release_store_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: agent_release_store_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: agent_release_store_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ store atomic i32 %val, ptr addrspace(1) %ptr syncscope("agent") release, align 4, !mmra !0
+ ret void
+}
+
+define void @system_release_store_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: system_release_store_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: system_release_store_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: system_release_store_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: system_release_store_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ store atomic i32 %val, ptr addrspace(1) %ptr release, align 4, !mmra !0
+ ret void
+}
+
+; Atomicrmw: agent acq_rel and system seq_cst.
+
+define i32 @agent_acq_rel_atomicrmw_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = atomicrmw add ptr addrspace(1) %ptr, i32 %val syncscope("agent") acq_rel, !mmra !0
+ ret i32 %ret
+}
+
+define i32 @system_seq_cst_atomicrmw_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = atomicrmw add ptr addrspace(1) %ptr, i32 %val seq_cst, !mmra !0
+ ret i32 %ret
+}
+
+; Cmpxchg: cluster acq_rel.
+
+define { i32, i1 } @cluster_acq_rel_cmpxchg_av_none(ptr addrspace(1) %ptr, i32 %cmp, i32 %new) {
+; GFX90A-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_cmpswap v0, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[0:1], v[4:5], off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s0, v0, v2
+; GFX12-WGP-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v4, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s0, v0, v2
+; GFX12-CU-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = cmpxchg ptr addrspace(1) %ptr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire, !mmra !0
+ ret { i32, i1 } %ret
+}
+
+!0 = !{!"amdgcn-av", !"none"}
>From 34967784d182126dc9667cf0c68363bff6b6a3db Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Wed, 27 May 2026 14:30:09 +0530
Subject: [PATCH 2/3] diagnose unknown metadata
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 24 ++++++++++++++++---
.../AMDGPU/memory-legalizer-av-unknown.ll | 11 +++++++++
2 files changed, 32 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3ff68da16630a..4ed98ec826692 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -458,8 +458,8 @@ class SICacheControl {
/// Inserts writeback (unless \p IsAVNone) followed by an unconditional wait.
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
- bool Changed = insertWriteback(MI, Scope, AddrSpace, Pos);
+ Position Pos, bool IsAVNone) const {
+ bool Changed = !IsAVNone && insertWriteback(MI, Scope, AddrSpace, Pos);
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos,
AtomicOrdering::Release, /*AtomicsOnly=*/false);
@@ -735,11 +735,29 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
return Result;
}
+static void diagnoseUnknownAVMetadata(const MachineInstr &MI,
+ StringRef Suffix) {
+ const MachineFunction *MF = MI.getMF();
+ const Function &Fn = MF->getFunction();
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "unknown amdgcn-av metadata '" << Suffix << '\'';
+ Fn.getContext().diagnose(
+ DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
+}
+
static bool hasAVNoneMMRA(const MachineInstr &MI) {
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
if (!MMRA)
return false;
- return MMRA.hasTag("amdgcn-av", "none");
+ for (const auto &[Prefix, Suffix] : MMRA) {
+ if (Prefix != "amdgcn-av")
+ continue;
+ if (Suffix == "none")
+ return true;
+ diagnoseUnknownAVMetadata(MI, Suffix);
+ }
+ return false;
}
} // end anonymous namespace
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll
new file mode 100644
index 0000000000000..fd5216f4b13d8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -o /dev/null %s 2>&1 | FileCheck %s
+
+; CHECK: warning: {{.*}}: unknown amdgcn-av metadata 'bogus'
+
+define void @test_unknown_av() {
+entry:
+ fence seq_cst, !mmra !0
+ ret void
+}
+
+!0 = !{!"amdgcn-av", !"bogus"}
>From 3d9cc99fb862ca27bf97f78fd95ccf1d8aa62275 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Wed, 27 May 2026 15:06:40 +0530
Subject: [PATCH 3/3] always diagnose unknown metadata
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 4ed98ec826692..75091e5dad179 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -750,14 +750,16 @@ static bool hasAVNoneMMRA(const MachineInstr &MI) {
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
if (!MMRA)
return false;
+ bool TagFound = false;
for (const auto &[Prefix, Suffix] : MMRA) {
if (Prefix != "amdgcn-av")
continue;
if (Suffix == "none")
- return true;
- diagnoseUnknownAVMetadata(MI, Suffix);
+ TagFound = true;
+ else
+ diagnoseUnknownAVMetadata(MI, Suffix);
}
- return false;
+ return TagFound;
}
} // end anonymous namespace
More information about the llvm-branch-commits
mailing list