[llvm] AtomicExpand: Copy metadata from atomicrmw to cmpxchg (PR #109409)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 31 11:51:29 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109409
>From 4b8e77eb477ac53140af0adff12cf3e48f214400 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 20 Sep 2024 13:28:29 +0400
Subject: [PATCH 1/2] AtomicExpand: Copy metadata from atomicrmw to cmpxchg
When expanding an atomicrmw with a cmpxchg, preserve any metadata
attached to it. This will avoid unwanted double expansions
in a future commit.
The initial load should also probably receive the same metadata
(which for some reason is not emitted as an atomic).
---
llvm/include/llvm/CodeGen/AtomicExpandUtils.h | 9 +-
llvm/lib/CodeGen/AtomicExpandPass.cpp | 29 +-
.../AMDGPU/expand-atomic-f32-agent.ll | 386 +++++++++---------
.../AMDGPU/expand-atomic-f32-system.ll | 342 ++++++++--------
.../AMDGPU/expand-atomic-f64-agent.ll | 136 +++---
.../AMDGPU/expand-atomic-f64-system.ll | 136 +++---
.../AMDGPU/expand-atomic-i32-agent.ll | 6 +-
.../AMDGPU/expand-atomic-i32-system.ll | 30 +-
.../AMDGPU/expand-atomic-i64-agent.ll | 6 +-
.../AMDGPU/expand-atomic-i64-system.ll | 30 +-
...and-atomic-rmw-fadd-flat-specialization.ll | 8 +-
.../AMDGPU/expand-atomic-rmw-fadd.ll | 174 ++++----
.../AMDGPU/expand-atomic-rmw-fmax.ll | 4 +-
.../AMDGPU/expand-atomic-rmw-fmin.ll | 4 +-
.../AMDGPU/expand-atomic-rmw-fsub.ll | 4 +-
.../AMDGPU/expand-atomic-v2bf16-agent.ll | 186 +++++----
.../AMDGPU/expand-atomic-v2bf16-system.ll | 176 ++++----
.../AMDGPU/expand-atomic-v2f16-agent.ll | 168 ++++----
.../AMDGPU/expand-atomic-v2f16-system.ll | 158 +++----
...expand-atomicrmw-flat-noalias-addrspace.ll | 116 +++---
20 files changed, 1081 insertions(+), 1027 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
index 1cb410a0c31c69..feb05de20b4571 100644
--- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -20,10 +20,11 @@ class Value;
/// Parameters (see the expansion example below):
/// (the builder, %addr, %loaded, %new_val, ordering,
-/// /* OUT */ %success, /* OUT */ %new_loaded)
-using CreateCmpXchgInstFun =
- function_ref<void(IRBuilderBase &, Value *, Value *, Value *, Align,
- AtomicOrdering, SyncScope::ID, Value *&, Value *&)>;
+/// /* OUT */ %success, /* OUT */ %new_loaded,
+/// %MetadataSrc)
+using CreateCmpXchgInstFun = function_ref<void(
+ IRBuilderBase &, Value *, Value *, Value *, Align, AtomicOrdering,
+ SyncScope::ID, Value *&, Value *&, Instruction *)>;
/// Expand an atomic RMW instruction into a loop utilizing
/// cmpxchg. You'll want to make sure your target machine likes cmpxchg
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3d4e2cb196a16a..b5eca44cb611a3 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -98,7 +98,7 @@ class AtomicExpandImpl {
IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
- CreateCmpXchgInstFun CreateCmpXchg);
+ CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
@@ -600,7 +600,8 @@ void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Value *Loaded, Value *NewVal, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
- Value *&Success, Value *&NewLoaded) {
+ Value *&Success, Value *&NewLoaded,
+ Instruction *MetadataSrc) {
Type *OrigTy = NewVal->getType();
// This code can go away when cmpxchg supports FP and vector types.
@@ -612,9 +613,12 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Loaded = Builder.CreateBitCast(Loaded, IntTy);
}
- Value *Pair = Builder.CreateAtomicCmpXchg(
+ AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ if (MetadataSrc)
+ Pair->copyMetadata(*MetadataSrc);
+
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
@@ -951,9 +955,9 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
Value *OldResult;
if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
- OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
- PMV.AlignedAddrAlignment, MemOpOrder, SSID,
- PerformPartwordOp, createCmpXchgInstFun);
+ OldResult = insertRMWCmpXchgLoop(
+ Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment,
+ MemOpOrder, SSID, PerformPartwordOp, createCmpXchgInstFun, AI);
} else {
assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
@@ -981,6 +985,7 @@ static void copyMetadataForAtomic(Instruction &Dest,
case LLVMContext::MD_tbaa_struct:
case LLVMContext::MD_alias_scope:
case LLVMContext::MD_noalias:
+ case LLVMContext::MD_noalias_addrspace:
case LLVMContext::MD_access_group:
case LLVMContext::MD_mmra:
Dest.setMetadata(ID, N);
@@ -1591,7 +1596,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
- CreateCmpXchgInstFun CreateCmpXchg) {
+ CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
LLVMContext &Ctx = Builder.getContext();
BasicBlock *BB = Builder.GetInsertBlock();
Function *F = BB->getParent();
@@ -1637,7 +1642,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
MemOpOrder == AtomicOrdering::Unordered
? AtomicOrdering::Monotonic
: MemOpOrder,
- SSID, Success, NewLoaded);
+ SSID, Success, NewLoaded, MetadataSrc);
assert(Success && NewLoaded);
Loaded->addIncoming(NewLoaded, LoopBB);
@@ -1686,7 +1691,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
AI->getValOperand());
},
- CreateCmpXchg);
+ CreateCmpXchg, /*MetadataSrc=*/AI);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -1838,11 +1843,15 @@ void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
expandAtomicRMWToCmpXchg(
I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
- SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
+ SyncScope::ID SSID, Value *&Success, Value *&NewLoaded,
+ Instruction *MetadataSrc) {
// Create the CAS instruction normally...
AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, Alignment, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ if (MetadataSrc)
+ Pair->copyMetadata(*MetadataSrc);
+
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index d3fb9d8ee522e7..443c5d18e68949 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -187,7 +187,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -204,7 +204,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -221,7 +221,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -238,7 +238,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -260,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -292,7 +292,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -309,7 +309,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -326,7 +326,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -343,7 +343,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -382,7 +382,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -409,7 +409,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -426,7 +426,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -443,7 +443,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -460,7 +460,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -482,7 +482,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -514,7 +514,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -531,7 +531,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -548,7 +548,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -565,7 +565,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -587,7 +587,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -619,7 +619,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -636,7 +636,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -653,7 +653,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -670,7 +670,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -692,7 +692,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -724,7 +724,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -741,7 +741,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -758,7 +758,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -775,7 +775,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -797,7 +797,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -814,7 +814,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -841,7 +841,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -858,7 +858,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -875,7 +875,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -902,7 +902,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -934,7 +934,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -951,7 +951,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -968,7 +968,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -985,7 +985,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1007,7 +1007,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1024,7 +1024,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1051,7 +1051,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1068,7 +1068,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1085,7 +1085,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1112,7 +1112,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1144,7 +1144,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1161,7 +1161,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1178,7 +1178,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1205,7 +1205,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1237,7 +1237,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1254,7 +1254,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1271,7 +1271,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1298,7 +1298,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1451,7 +1451,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1468,7 +1468,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1485,7 +1485,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1502,7 +1502,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1524,7 +1524,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1556,7 +1556,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1573,7 +1573,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1590,7 +1590,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1607,7 +1607,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1629,7 +1629,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1646,7 +1646,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1673,7 +1673,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1690,7 +1690,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1707,7 +1707,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1724,7 +1724,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1746,7 +1746,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1778,7 +1778,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1795,7 +1795,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1812,7 +1812,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1829,7 +1829,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1851,7 +1851,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1883,7 +1883,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1900,7 +1900,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1917,7 +1917,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1934,7 +1934,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1956,7 +1956,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1988,7 +1988,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2005,7 +2005,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2022,7 +2022,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2039,7 +2039,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2061,7 +2061,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2078,7 +2078,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2105,7 +2105,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2122,7 +2122,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2154,7 +2154,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2186,7 +2186,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2203,7 +2203,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2220,7 +2220,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2237,7 +2237,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2259,7 +2259,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2276,7 +2276,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2303,7 +2303,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2320,7 +2320,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2352,7 +2352,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2384,7 +2384,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2401,7 +2401,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2433,7 +2433,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2465,7 +2465,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2482,7 +2482,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2514,7 +2514,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2572,7 +2572,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2594,7 +2594,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2616,7 +2616,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2638,7 +2638,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2660,7 +2660,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2682,7 +2682,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2704,7 +2704,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2859,7 +2859,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2876,7 +2876,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2893,7 +2893,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2910,7 +2910,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2927,7 +2927,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2964,7 +2964,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2981,7 +2981,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2998,7 +2998,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3015,7 +3015,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3032,7 +3032,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3049,7 +3049,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3066,7 +3066,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3093,7 +3093,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3110,7 +3110,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3127,7 +3127,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3144,7 +3144,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3161,7 +3161,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3198,7 +3198,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3215,7 +3215,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3232,7 +3232,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3249,7 +3249,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3266,7 +3266,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3283,7 +3283,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3300,7 +3300,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3327,7 +3327,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3344,7 +3344,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3361,7 +3361,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3378,7 +3378,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3395,7 +3395,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3432,7 +3432,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3449,7 +3449,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3466,7 +3466,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3483,7 +3483,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3500,7 +3500,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3517,7 +3517,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3534,7 +3534,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3561,7 +3561,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3578,7 +3578,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3595,7 +3595,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3612,7 +3612,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3629,7 +3629,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3799,7 +3799,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3816,7 +3816,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3833,7 +3833,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3850,7 +3850,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3867,7 +3867,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3904,7 +3904,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3921,7 +3921,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3938,7 +3938,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3955,7 +3955,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3972,7 +3972,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3989,7 +3989,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -4006,7 +4006,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -4033,7 +4033,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4050,7 +4050,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4067,7 +4067,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4084,7 +4084,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4101,7 +4101,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4138,7 +4138,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4155,7 +4155,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4172,7 +4172,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4189,7 +4189,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4206,7 +4206,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4223,7 +4223,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4240,7 +4240,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4267,7 +4267,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4284,7 +4284,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4301,7 +4301,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4318,7 +4318,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4335,7 +4335,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4372,7 +4372,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4389,7 +4389,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4406,7 +4406,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4423,7 +4423,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4440,7 +4440,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4457,7 +4457,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -4474,7 +4474,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -4501,7 +4501,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4518,7 +4518,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4535,7 +4535,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4552,7 +4552,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4569,7 +4569,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index d48e7317abb5da..82b029285cacdd 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -92,7 +92,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -109,7 +109,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -126,7 +126,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -143,7 +143,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -165,7 +165,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -197,7 +197,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -214,7 +214,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -231,7 +231,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -248,7 +248,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -270,7 +270,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -287,7 +287,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -314,7 +314,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -331,7 +331,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -348,7 +348,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -387,7 +387,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -419,7 +419,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -436,7 +436,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -453,7 +453,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -470,7 +470,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -492,7 +492,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -524,7 +524,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -541,7 +541,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -558,7 +558,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -575,7 +575,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -597,7 +597,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -629,7 +629,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -651,7 +651,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -668,7 +668,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -685,7 +685,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -712,7 +712,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -744,7 +744,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -761,7 +761,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -778,7 +778,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -795,7 +795,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -817,7 +817,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -834,7 +834,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -861,7 +861,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -878,7 +878,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -895,7 +895,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -922,7 +922,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -954,7 +954,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -971,7 +971,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -988,7 +988,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1015,7 +1015,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1047,7 +1047,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1064,7 +1064,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1081,7 +1081,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1108,7 +1108,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1166,7 +1166,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1183,7 +1183,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1200,7 +1200,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1217,7 +1217,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1239,7 +1239,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1271,7 +1271,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1288,7 +1288,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1305,7 +1305,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1322,7 +1322,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1344,7 +1344,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1361,7 +1361,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1388,7 +1388,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1405,7 +1405,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1422,7 +1422,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1439,7 +1439,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1461,7 +1461,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1493,7 +1493,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1510,7 +1510,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1527,7 +1527,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1544,7 +1544,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1566,7 +1566,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1598,7 +1598,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1615,7 +1615,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1632,7 +1632,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1649,7 +1649,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1671,7 +1671,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1703,7 +1703,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; COMMON-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1725,7 +1725,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1742,7 +1742,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1774,7 +1774,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1806,7 +1806,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1823,7 +1823,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1840,7 +1840,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1857,7 +1857,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1879,7 +1879,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1896,7 +1896,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1923,7 +1923,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1940,7 +1940,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1972,7 +1972,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2004,7 +2004,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2021,7 +2021,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2053,7 +2053,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2085,7 +2085,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2102,7 +2102,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2134,7 +2134,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2192,7 +2192,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2214,7 +2214,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2236,7 +2236,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2258,7 +2258,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2280,7 +2280,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2302,7 +2302,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2324,7 +2324,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2372,7 +2372,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2389,7 +2389,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2406,7 +2406,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2423,7 +2423,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2440,7 +2440,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2477,7 +2477,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2494,7 +2494,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2511,7 +2511,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2528,7 +2528,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2545,7 +2545,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2562,7 +2562,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2579,7 +2579,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2606,7 +2606,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2623,7 +2623,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2640,7 +2640,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2657,7 +2657,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2674,7 +2674,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2711,7 +2711,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2733,7 +2733,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2750,7 +2750,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2767,7 +2767,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2784,7 +2784,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2801,7 +2801,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2838,7 +2838,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2855,7 +2855,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2872,7 +2872,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2889,7 +2889,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2906,7 +2906,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2923,7 +2923,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2940,7 +2940,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2967,7 +2967,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2984,7 +2984,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3001,7 +3001,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3018,7 +3018,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3035,7 +3035,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3098,7 +3098,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3115,7 +3115,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3132,7 +3132,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3149,7 +3149,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3166,7 +3166,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3203,7 +3203,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3220,7 +3220,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3237,7 +3237,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3254,7 +3254,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3271,7 +3271,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3288,7 +3288,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3305,7 +3305,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3332,7 +3332,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3349,7 +3349,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3366,7 +3366,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3383,7 +3383,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3400,7 +3400,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3437,7 +3437,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3459,7 +3459,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3476,7 +3476,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3493,7 +3493,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3510,7 +3510,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3527,7 +3527,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3564,7 +3564,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3581,7 +3581,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3598,7 +3598,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3615,7 +3615,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3632,7 +3632,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3649,7 +3649,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3666,7 +3666,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3693,7 +3693,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3710,7 +3710,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3727,7 +3727,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3744,7 +3744,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3761,7 +3761,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index 19b02a364ac11a..af9933fa9e7260 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -199,7 +199,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -216,7 +216,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -233,7 +233,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -260,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -277,7 +277,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -294,7 +294,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -316,7 +316,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -333,7 +333,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -350,7 +350,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -367,7 +367,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -389,7 +389,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -406,7 +406,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -423,7 +423,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -445,7 +445,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -462,7 +462,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -479,7 +479,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -506,7 +506,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -523,7 +523,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -540,7 +540,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -562,7 +562,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -579,7 +579,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -596,7 +596,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -623,7 +623,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -640,7 +640,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -657,7 +657,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -679,7 +679,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -696,7 +696,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -713,7 +713,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -740,7 +740,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -757,7 +757,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -774,7 +774,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1008,7 +1008,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1030,7 +1030,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1052,7 +1052,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1331,7 +1331,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1348,7 +1348,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1365,7 +1365,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1397,7 +1397,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1414,7 +1414,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1436,7 +1436,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1453,7 +1453,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1470,7 +1470,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1487,7 +1487,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1509,7 +1509,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1526,7 +1526,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1543,7 +1543,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1565,7 +1565,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1582,7 +1582,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1599,7 +1599,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1631,7 +1631,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1648,7 +1648,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1927,7 +1927,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1944,7 +1944,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1961,7 +1961,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1993,7 +1993,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2010,7 +2010,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2032,7 +2032,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2049,7 +2049,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2066,7 +2066,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2083,7 +2083,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -2105,7 +2105,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -2122,7 +2122,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2139,7 +2139,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2161,7 +2161,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2178,7 +2178,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2195,7 +2195,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2227,7 +2227,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2244,7 +2244,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index e56417167c33b0..d01dd2eb29538e 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -92,7 +92,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -109,7 +109,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -126,7 +126,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -153,7 +153,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -170,7 +170,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -187,7 +187,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -209,7 +209,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -226,7 +226,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -243,7 +243,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -260,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -282,7 +282,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -299,7 +299,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -316,7 +316,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -338,7 +338,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -355,7 +355,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -372,7 +372,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -399,7 +399,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -416,7 +416,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -433,7 +433,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -455,7 +455,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -472,7 +472,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -489,7 +489,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -516,7 +516,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -533,7 +533,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -550,7 +550,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -572,7 +572,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -589,7 +589,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -606,7 +606,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -633,7 +633,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -650,7 +650,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -667,7 +667,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -901,7 +901,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -923,7 +923,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(pt
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -945,7 +945,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1117,7 +1117,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1134,7 +1134,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1151,7 +1151,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1183,7 +1183,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1200,7 +1200,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1222,7 +1222,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1239,7 +1239,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1256,7 +1256,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1273,7 +1273,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1295,7 +1295,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1312,7 +1312,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1329,7 +1329,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1351,7 +1351,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1368,7 +1368,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1385,7 +1385,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1417,7 +1417,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1434,7 +1434,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1606,7 +1606,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1623,7 +1623,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1640,7 +1640,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1672,7 +1672,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1689,7 +1689,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1711,7 +1711,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1728,7 +1728,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1745,7 +1745,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1762,7 +1762,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1784,7 +1784,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1801,7 +1801,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1818,7 +1818,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1840,7 +1840,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1857,7 +1857,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1874,7 +1874,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1906,7 +1906,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1923,7 +1923,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
index 5dbf2f6e696e1a..c3a0a4192ff17d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
@@ -228,7 +228,7 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory(
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -248,7 +248,7 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_remote_memory(ptr ad
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -268,7 +268,7 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory_
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
index 175f75634e706a..be3aaeb1706734 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
@@ -228,7 +228,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -248,7 +248,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory(ptr a
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -268,7 +268,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -403,7 +403,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory(
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -423,7 +423,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory(ptr ad
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -443,7 +443,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory_
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -488,7 +488,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory(
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -508,7 +508,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory(ptr ad
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -528,7 +528,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory_
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -573,7 +573,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -593,7 +593,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory(ptr a
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -613,7 +613,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -658,7 +658,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -678,7 +678,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory(ptr a
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -698,7 +698,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
index bd37f5ba88c6bd..77fe5e2aba9137 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
@@ -228,7 +228,7 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory(
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -248,7 +248,7 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_remote_memory(ptr ad
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -268,7 +268,7 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory_
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
index ecb898d120dd18..bd2aa846efb210 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
@@ -228,7 +228,7 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -248,7 +248,7 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory(ptr a
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -268,7 +268,7 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -403,7 +403,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -423,7 +423,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(ptr ad
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -443,7 +443,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory_
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -488,7 +488,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -508,7 +508,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory(ptr ad
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -528,7 +528,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory_
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -573,7 +573,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -593,7 +593,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory(ptr a
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -613,7 +613,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -658,7 +658,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -678,7 +678,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory(ptr a
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
@@ -698,7 +698,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory
; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
index 056eee5b987d65..9c879e4bbf7583 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
@@ -13,7 +13,7 @@ define float @syncscope_system(ptr %addr, float %val) {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -83,7 +83,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -150,7 +150,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
; GFX908-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]]
; GFX908: atomicrmw.shared:
; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(3)
-; GFX908-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]]
; GFX908: atomicrmw.check.private:
; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[ADDR]])
@@ -303,7 +303,7 @@ define float @flat_atomicrmw_fadd_f32__align32(ptr %addr, float %val) {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index d55f7ca9f2baae..71311217eee8c1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -282,7 +282,7 @@ define float @test_atomicrmw_fadd_f32_global_unsafe(ptr addrspace(1) %ptr, float
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -298,7 +298,7 @@ define float @test_atomicrmw_fadd_f32_global_unsafe(ptr addrspace(1) %ptr, float
; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -314,7 +314,7 @@ define float @test_atomicrmw_fadd_f32_global_unsafe(ptr addrspace(1) %ptr, float
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -347,7 +347,7 @@ define float @test_atomicrmw_fadd_f32_buffer_fat_ptr_unsafe(ptr addrspace(7) %pt
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -363,7 +363,7 @@ define float @test_atomicrmw_fadd_f32_buffer_fat_ptr_unsafe(ptr addrspace(7) %pt
; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -379,7 +379,7 @@ define float @test_atomicrmw_fadd_f32_buffer_fat_ptr_unsafe(ptr addrspace(7) %pt
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -412,7 +412,7 @@ define float @test_atomicrmw_fadd_f32_as999_unsafe(ptr addrspace(999) %ptr, floa
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -428,7 +428,7 @@ define float @test_atomicrmw_fadd_f32_as999_unsafe(ptr addrspace(999) %ptr, floa
; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -444,7 +444,7 @@ define float @test_atomicrmw_fadd_f32_as999_unsafe(ptr addrspace(999) %ptr, floa
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -477,7 +477,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub
; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -493,7 +493,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub
; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -509,7 +509,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -533,7 +533,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -554,7 +554,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 {
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -570,7 +570,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 {
; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -586,7 +586,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -641,7 +641,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe__noprivate(ptr %ptr, double %
; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -657,7 +657,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe__noprivate(ptr %ptr, double %
; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -673,7 +673,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe__noprivate(ptr %ptr, double %
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -697,7 +697,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe__noprivate(ptr %ptr, double %
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -727,7 +727,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 {
; CI-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8
+; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -757,7 +757,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 {
; GFX9-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -787,7 +787,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 {
; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -853,7 +853,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 {
; GFX11-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -984,7 +984,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1)
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1000,7 +1000,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1)
; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1016,7 +1016,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1)
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1032,7 +1032,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1)
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1244,7 +1244,7 @@ define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) {
; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1270,7 +1270,7 @@ define double @test_atomicrmw_fadd_f64_flat__noprivate(ptr %ptr, double %value)
; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1499,7 +1499,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(ptr addrspace
; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1515,7 +1515,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(ptr addrspace
; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1524,7 +1524,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(ptr addrspace
; GFX9-NEXT: ret void
;
; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(
@@ -1552,7 +1552,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1568,7 +1568,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1584,7 +1584,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1608,7 +1608,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2056,7 +2056,7 @@ define void @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2077,7 +2077,7 @@ define float @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2140,7 +2140,7 @@ define void @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_deno
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2161,7 +2161,7 @@ define float @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denor
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2182,7 +2182,7 @@ define void @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignor
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2203,7 +2203,7 @@ define float @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2306,7 +2306,7 @@ define void @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(pt
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2315,7 +2315,7 @@ define void @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(pt
; CI-NEXT: ret void
;
; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(
-; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
+; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX9-NEXT: ret void
;
; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(
@@ -2347,7 +2347,7 @@ define float @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode(ptr
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2430,7 +2430,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignor
; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2451,7 +2451,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignor
; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2626,7 +2626,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2642,7 +2642,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2658,7 +2658,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2682,7 +2682,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2703,7 +2703,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2719,7 +2719,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2735,7 +2735,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2759,7 +2759,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2780,7 +2780,7 @@ define void @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_m
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2801,7 +2801,7 @@ define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mo
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2831,7 +2831,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_
; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8
+; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2865,7 +2865,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_
; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8
+; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.ignore.denormal.mode [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -4221,7 +4221,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(
; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4237,7 +4237,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4253,7 +4253,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4277,7 +4277,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4298,7 +4298,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(
; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4314,7 +4314,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4342,7 +4342,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4363,7 +4363,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2
; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4379,7 +4379,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4395,7 +4395,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4411,7 +4411,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4431,7 +4431,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4452,7 +4452,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x
; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4468,7 +4468,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4484,7 +4484,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4500,7 +4500,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4520,7 +4520,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4541,7 +4541,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa
; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4557,7 +4557,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4573,7 +4573,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4589,7 +4589,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4609,7 +4609,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4630,7 +4630,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace
; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4646,7 +4646,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4662,7 +4662,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4678,7 +4678,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4698,7 +4698,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
index 3cb0165075e821..7859b8bb40734e 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
@@ -204,7 +204,7 @@ define double @test_atomicrmw_fmax_f64_flat(ptr %ptr, double %value) {
; GCN-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GCN-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8
+; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]]
; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GCN-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -230,7 +230,7 @@ define double @test_atomicrmw_fmax_f64_flat__noprivate(ptr %ptr, double %value)
; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]])
; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
index 3ab28af0872c4f..315af40ce3201d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
@@ -204,7 +204,7 @@ define double @test_atomicrmw_fmin_f64_flat(ptr %ptr, double %value) {
; GCN-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GCN-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8
+; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]]
; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GCN-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -230,7 +230,7 @@ define double @test_atomicrmw_fmin_f64_flat__noprivate(ptr %ptr, double %value)
; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]])
; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
index 4c22d830f7a1c1..b4e999e58d0151 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
@@ -207,7 +207,7 @@ define double @test_atomicrmw_fsub_f64_flat__noprivate(ptr %ptr, double %value)
; GCN-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE:%.*]]
; GCN-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]]
; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GCN-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -237,7 +237,7 @@ define double @test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) {
; GCN-NEXT: [[NEW2:%.*]] = fsub double [[LOADED]], [[VALUE]]
; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
index 26b84f82524e26..fa77ab87e6535d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
@@ -167,7 +167,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -184,7 +184,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -201,7 +201,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -218,7 +218,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -240,7 +240,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -257,7 +257,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -284,7 +284,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -301,7 +301,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -318,7 +318,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -335,7 +335,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -357,7 +357,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -374,7 +374,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -401,7 +401,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -418,7 +418,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -435,7 +435,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -452,7 +452,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -474,7 +474,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -491,7 +491,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -518,7 +518,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -535,7 +535,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -552,7 +552,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -569,7 +569,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -591,7 +591,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -608,7 +608,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -635,7 +635,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -652,7 +652,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -669,7 +669,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -686,7 +686,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -708,7 +708,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -725,7 +725,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -752,7 +752,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -769,7 +769,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -786,7 +786,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -803,7 +803,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -825,7 +825,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -842,7 +842,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -869,7 +869,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -886,7 +886,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -903,7 +903,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -920,7 +920,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -942,7 +942,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -959,7 +959,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -986,7 +986,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1003,7 +1003,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1020,7 +1020,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1037,7 +1037,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1059,7 +1059,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1076,7 +1076,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1103,7 +1103,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1120,7 +1120,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1137,7 +1137,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1154,7 +1154,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1176,7 +1176,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1193,7 +1193,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1220,7 +1220,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1237,7 +1237,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1254,7 +1254,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1271,7 +1271,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1293,7 +1293,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1310,7 +1310,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1337,7 +1337,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1354,7 +1354,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1371,7 +1371,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1388,7 +1388,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1410,7 +1410,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1427,7 +1427,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1480,7 +1480,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_gra
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1502,7 +1502,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_m
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1524,7 +1524,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_gra
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1546,7 +1546,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1568,7 +1568,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1590,7 +1590,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1612,7 +1612,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1660,7 +1660,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_gra
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1682,7 +1682,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_m
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1704,7 +1704,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_gra
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1726,7 +1726,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1748,7 +1748,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1770,7 +1770,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1792,7 +1792,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1840,7 +1840,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_gra
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1862,7 +1862,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_m
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1884,7 +1884,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_gra
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1906,7 +1906,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1928,7 +1928,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1950,7 +1950,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1972,7 +1972,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1989,7 +1989,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
; GFX940: [[META0]] = !{}
;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
; GFX12: [[META0]] = !{}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
index a2440def73aba8..5660b8f3544301 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
@@ -72,7 +72,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -89,7 +89,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -106,7 +106,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -123,7 +123,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -145,7 +145,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -162,7 +162,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -189,7 +189,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -206,7 +206,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -223,7 +223,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -240,7 +240,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -262,7 +262,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -279,7 +279,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -306,7 +306,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -323,7 +323,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -340,7 +340,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -357,7 +357,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -379,7 +379,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -396,7 +396,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -423,7 +423,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -440,7 +440,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -457,7 +457,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -474,7 +474,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -496,7 +496,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -513,7 +513,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -540,7 +540,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -557,7 +557,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -574,7 +574,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -591,7 +591,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -613,7 +613,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -630,7 +630,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -657,7 +657,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -679,7 +679,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -696,7 +696,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -713,7 +713,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -730,7 +730,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -752,7 +752,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -769,7 +769,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -796,7 +796,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -813,7 +813,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -830,7 +830,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -847,7 +847,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -869,7 +869,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -886,7 +886,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -913,7 +913,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -930,7 +930,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -947,7 +947,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -964,7 +964,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -986,7 +986,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1003,7 +1003,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1030,7 +1030,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1047,7 +1047,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1064,7 +1064,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1081,7 +1081,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1103,7 +1103,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1120,7 +1120,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1147,7 +1147,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1164,7 +1164,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1181,7 +1181,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1198,7 +1198,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1220,7 +1220,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1237,7 +1237,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1290,7 +1290,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1312,7 +1312,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1334,7 +1334,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1356,7 +1356,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1378,7 +1378,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1400,7 +1400,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1422,7 +1422,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1470,7 +1470,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1492,7 +1492,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1514,7 +1514,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1536,7 +1536,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1558,7 +1558,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1580,7 +1580,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1602,7 +1602,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1650,7 +1650,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1672,7 +1672,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1694,7 +1694,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1716,7 +1716,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1738,7 +1738,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1760,7 +1760,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1782,7 +1782,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1799,7 +1799,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
; GFX940: [[META0]] = !{}
;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
; GFX12: [[META0]] = !{}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
index 9d396aad18f231..a69d3549bc90c8 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
@@ -167,7 +167,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -184,7 +184,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -201,7 +201,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -228,7 +228,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -245,7 +245,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -272,7 +272,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -289,7 +289,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -306,7 +306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -323,7 +323,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -345,7 +345,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -362,7 +362,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -389,7 +389,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -406,7 +406,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -423,7 +423,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -450,7 +450,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -467,7 +467,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -494,7 +494,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -511,7 +511,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -528,7 +528,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -555,7 +555,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -572,7 +572,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -599,7 +599,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -616,7 +616,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -633,7 +633,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -660,7 +660,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -677,7 +677,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -704,7 +704,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -721,7 +721,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -738,7 +738,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -755,7 +755,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -777,7 +777,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -794,7 +794,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -821,7 +821,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -838,7 +838,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -855,7 +855,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -882,7 +882,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -899,7 +899,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -926,7 +926,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -943,7 +943,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -960,7 +960,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -977,7 +977,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -999,7 +999,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1016,7 +1016,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1043,7 +1043,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1060,7 +1060,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1077,7 +1077,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1104,7 +1104,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1121,7 +1121,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1148,7 +1148,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1165,7 +1165,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1182,7 +1182,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1209,7 +1209,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1226,7 +1226,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1253,7 +1253,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1270,7 +1270,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1287,7 +1287,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1314,7 +1314,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1331,7 +1331,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1384,7 +1384,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_graine
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1406,7 +1406,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1428,7 +1428,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_graine
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1450,7 +1450,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1472,7 +1472,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1494,7 +1494,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1516,7 +1516,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1564,7 +1564,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_graine
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1586,7 +1586,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memo
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1608,7 +1608,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_graine
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1630,7 +1630,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1652,7 +1652,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1674,7 +1674,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1696,7 +1696,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1744,7 +1744,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_graine
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1766,7 +1766,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memo
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1788,7 +1788,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_graine
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1810,7 +1810,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1832,7 +1832,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1854,7 +1854,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1876,7 +1876,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1893,9 +1893,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
; GFX90A: [[META0]] = !{}
;.
; GFX940: [[META0]] = !{}
;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
; GFX12: [[META0]] = !{}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
index 29d9473073adba..f5a59e6bd519f4 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
@@ -72,7 +72,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -89,7 +89,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -106,7 +106,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -133,7 +133,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -150,7 +150,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -177,7 +177,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -194,7 +194,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -211,7 +211,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -228,7 +228,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -250,7 +250,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -267,7 +267,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -294,7 +294,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -311,7 +311,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -328,7 +328,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -355,7 +355,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -372,7 +372,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -399,7 +399,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -416,7 +416,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -433,7 +433,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -460,7 +460,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -477,7 +477,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -504,7 +504,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -521,7 +521,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -538,7 +538,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -565,7 +565,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -582,7 +582,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -609,7 +609,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -631,7 +631,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -648,7 +648,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -665,7 +665,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -692,7 +692,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -709,7 +709,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -736,7 +736,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -753,7 +753,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -770,7 +770,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -787,7 +787,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -809,7 +809,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -826,7 +826,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -853,7 +853,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -870,7 +870,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -887,7 +887,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -914,7 +914,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -931,7 +931,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -958,7 +958,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -975,7 +975,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -992,7 +992,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1019,7 +1019,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1036,7 +1036,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1063,7 +1063,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1080,7 +1080,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1097,7 +1097,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1124,7 +1124,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1141,7 +1141,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1194,7 +1194,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1216,7 +1216,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_mem
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1238,7 +1238,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1260,7 +1260,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1282,7 +1282,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1304,7 +1304,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1326,7 +1326,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1374,7 +1374,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1396,7 +1396,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_mem
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1418,7 +1418,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1440,7 +1440,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1462,7 +1462,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1484,7 +1484,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1506,7 +1506,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1554,7 +1554,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1576,7 +1576,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_mem
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1598,7 +1598,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1620,7 +1620,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1642,7 +1642,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1664,7 +1664,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1686,7 +1686,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1703,9 +1703,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
; GFX90A: [[META0]] = !{}
;.
; GFX940: [[META0]] = !{}
;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
; GFX12: [[META0]] = !{}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll
index cb2ba0f7eb0b5d..2b9cff80172012 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll
@@ -500,7 +500,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) {
; GFX7-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -531,7 +531,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) {
; GFX900-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX900-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX900-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -562,7 +562,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) {
; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -631,7 +631,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) {
; GFX12-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -658,7 +658,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -675,7 +675,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -692,7 +692,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -719,7 +719,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -741,7 +741,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi
; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -758,7 +758,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi
; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -775,7 +775,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -792,7 +792,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -814,7 +814,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -836,7 +836,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX7-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX7-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -853,7 +853,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -870,7 +870,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -929,7 +929,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr
; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -946,7 +946,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr
; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -963,7 +963,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -980,7 +980,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1012,7 +1012,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(
; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1029,7 +1029,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(
; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1046,7 +1046,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1063,7 +1063,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1127,7 +1127,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) {
; GFX900-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX900-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GFX900-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GFX900-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -1158,7 +1158,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) {
; GFX908-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GFX908-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GFX908-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -1227,7 +1227,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) {
; GFX12-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GFX12-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GFX12-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -1259,7 +1259,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1276,7 +1276,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1303,7 +1303,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1325,7 +1325,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi
; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1342,7 +1342,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi
; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1359,7 +1359,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi
; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1376,7 +1376,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi
; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1398,7 +1398,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi
; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1425,7 +1425,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1442,7 +1442,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1459,7 +1459,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float
@@ -1476,7 +1476,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -1535,7 +1535,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) {
; GFX900-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX900-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GFX900-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GFX900-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -1566,7 +1566,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) {
; GFX908-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GFX908-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GFX908-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -1635,7 +1635,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) {
; GFX12-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64
; GFX12-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0
; GFX12-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double
@@ -1667,7 +1667,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1684,7 +1684,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
@@ -1711,7 +1711,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr,
; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1733,7 +1733,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi
; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -1750,7 +1750,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi
; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1767,7 +1767,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi
; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1784,7 +1784,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi
; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1806,7 +1806,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi
; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -1833,7 +1833,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1850,7 +1850,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1867,7 +1867,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float
@@ -1884,7 +1884,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -1924,7 +1924,7 @@ define i64 @test_flat_atomicrmw_nand_i64_agent(ptr %ptr, i64 %value) {
; ALL-NEXT: [[LOADED1:%.*]] = phi i64 [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ]
; ALL-NEXT: [[TMP4:%.*]] = and i64 [[LOADED1]], [[VALUE]]
; ALL-NEXT: [[NEW2:%.*]] = xor i64 [[TMP4]], -1
-; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED1]], i64 [[NEW2]] syncscope("agent") seq_cst seq_cst, align 8
+; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED1]], i64 [[NEW2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; ALL-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP5]], 0
; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]]
@@ -1949,7 +1949,7 @@ define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5(ptr %ptr, i6
; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ]
; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0
; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]]
@@ -1969,7 +1969,7 @@ define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_
; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ]
; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
-; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0
; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]]
@@ -1990,7 +1990,7 @@ define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i3
; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ]
; ALL-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
; ALL-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
-; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; ALL-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP3]], 0
; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]]
>From df51d36b65bb9b2ca84855af02da24ab65d43b53 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 10 Oct 2024 15:09:33 +0400
Subject: [PATCH 2/2] Use copyMetadataForAtomic
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 68 +++---
.../AMDGPU/expand-atomic-f32-agent.ll | 208 +++++++++---------
.../AMDGPU/expand-atomic-f32-system.ll | 164 +++++++-------
...and-atomic-rmw-fadd-flat-specialization.ll | 6 +-
.../AMDGPU/expand-atomic-rmw-fadd.ll | 44 ++--
.../AMDGPU/expand-atomic-v2bf16-agent.ll | 96 ++++----
.../AMDGPU/expand-atomic-v2bf16-system.ll | 88 ++++----
.../AMDGPU/expand-atomic-v2f16-agent.ll | 88 ++++----
.../AMDGPU/expand-atomic-v2f16-system.ll | 80 +++----
...expand-atomicrmw-flat-noalias-addrspace.ll | 6 +-
10 files changed, 425 insertions(+), 423 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index b5eca44cb611a3..0aff4f1f5cf1cb 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -194,6 +194,39 @@ static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}
+/// Copy metadata that's safe to preserve when widening atomics.
+static void copyMetadataForAtomic(Instruction &Dest,
+ const Instruction &Source) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ Source.getAllMetadata(MD);
+ LLVMContext &Ctx = Dest.getContext();
+ MDBuilder MDB(Ctx);
+
+ for (auto [ID, N] : MD) {
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_noalias_addrspace:
+ case LLVMContext::MD_access_group:
+ case LLVMContext::MD_mmra:
+ Dest.setMetadata(ID, N);
+ break;
+ default:
+ if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory"))
+ Dest.setMetadata(ID, N);
+ else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory"))
+ Dest.setMetadata(ID, N);
+
+ // Losing amdgpu.ignore.denormal.mode, but it doesn't matter for current
+ // uses.
+ break;
+ }
+ }
+}
+
// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall)
@@ -617,7 +650,7 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
if (MetadataSrc)
- Pair->copyMetadata(*MetadataSrc);
+ copyMetadataForAtomic(*Pair, *MetadataSrc);
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
@@ -970,37 +1003,6 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
AI->eraseFromParent();
}
-/// Copy metadata that's safe to preserve when widening atomics.
-static void copyMetadataForAtomic(Instruction &Dest,
- const Instruction &Source) {
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- Source.getAllMetadata(MD);
- LLVMContext &Ctx = Dest.getContext();
- MDBuilder MDB(Ctx);
-
- for (auto [ID, N] : MD) {
- switch (ID) {
- case LLVMContext::MD_dbg:
- case LLVMContext::MD_tbaa:
- case LLVMContext::MD_tbaa_struct:
- case LLVMContext::MD_alias_scope:
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_noalias_addrspace:
- case LLVMContext::MD_access_group:
- case LLVMContext::MD_mmra:
- Dest.setMetadata(ID, N);
- break;
- default:
- if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory"))
- Dest.setMetadata(ID, N);
- else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory"))
- Dest.setMetadata(ID, N);
-
- break;
- }
- }
-}
-
// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
ReplacementIRBuilder Builder(AI, *DL);
@@ -1850,7 +1852,7 @@ void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
Addr, Loaded, NewVal, Alignment, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
if (MetadataSrc)
- Pair->copyMetadata(*MetadataSrc);
+ copyMetadataForAtomic(*Pair, *MetadataSrc);
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index 443c5d18e68949..69cf9697d4b306 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -724,7 +724,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -741,7 +741,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -758,7 +758,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -775,7 +775,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -797,7 +797,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -814,7 +814,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -841,7 +841,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -858,7 +858,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -875,7 +875,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -902,7 +902,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -934,7 +934,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -951,7 +951,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -968,7 +968,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -985,7 +985,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1007,7 +1007,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1024,7 +1024,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1051,7 +1051,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1068,7 +1068,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1085,7 +1085,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1112,7 +1112,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1144,7 +1144,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1161,7 +1161,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1178,7 +1178,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1205,7 +1205,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1237,7 +1237,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1254,7 +1254,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1271,7 +1271,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1298,7 +1298,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1988,7 +1988,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2005,7 +2005,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2022,7 +2022,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2039,7 +2039,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2061,7 +2061,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2078,7 +2078,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2105,7 +2105,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2122,7 +2122,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2154,7 +2154,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2186,7 +2186,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2203,7 +2203,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2220,7 +2220,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2237,7 +2237,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2259,7 +2259,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2276,7 +2276,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2303,7 +2303,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2320,7 +2320,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2352,7 +2352,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2384,7 +2384,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2401,7 +2401,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2433,7 +2433,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2465,7 +2465,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2482,7 +2482,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2514,7 +2514,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2638,7 +2638,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2660,7 +2660,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2682,7 +2682,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2704,7 +2704,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -3198,7 +3198,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3215,7 +3215,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3232,7 +3232,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3249,7 +3249,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3266,7 +3266,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3283,7 +3283,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3300,7 +3300,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3327,7 +3327,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3344,7 +3344,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3361,7 +3361,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3378,7 +3378,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3395,7 +3395,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3432,7 +3432,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3449,7 +3449,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3466,7 +3466,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3483,7 +3483,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3500,7 +3500,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3517,7 +3517,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3534,7 +3534,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3561,7 +3561,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3578,7 +3578,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3595,7 +3595,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3612,7 +3612,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3629,7 +3629,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4138,7 +4138,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4155,7 +4155,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4172,7 +4172,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4189,7 +4189,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4206,7 +4206,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4223,7 +4223,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4240,7 +4240,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4267,7 +4267,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4284,7 +4284,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4301,7 +4301,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4318,7 +4318,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4335,7 +4335,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4372,7 +4372,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4389,7 +4389,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4406,7 +4406,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4423,7 +4423,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4440,7 +4440,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4457,7 +4457,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -4474,7 +4474,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -4501,7 +4501,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4518,7 +4518,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4535,7 +4535,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4552,7 +4552,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -4569,7 +4569,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 82b029285cacdd..0976022825ced3 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -629,7 +629,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -651,7 +651,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -668,7 +668,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -685,7 +685,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -712,7 +712,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -744,7 +744,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -761,7 +761,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -778,7 +778,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -795,7 +795,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -817,7 +817,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -834,7 +834,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -861,7 +861,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -878,7 +878,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -895,7 +895,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -922,7 +922,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -954,7 +954,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -971,7 +971,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -988,7 +988,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1015,7 +1015,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1047,7 +1047,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1064,7 +1064,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1081,7 +1081,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1108,7 +1108,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1703,7 +1703,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; COMMON-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1725,7 +1725,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1742,7 +1742,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1774,7 +1774,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1806,7 +1806,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1823,7 +1823,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1840,7 +1840,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1857,7 +1857,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1879,7 +1879,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1896,7 +1896,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1923,7 +1923,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1940,7 +1940,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -1972,7 +1972,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2004,7 +2004,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2021,7 +2021,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2053,7 +2053,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2085,7 +2085,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2102,7 +2102,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2134,7 +2134,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2258,7 +2258,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2280,7 +2280,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2302,7 +2302,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2324,7 +2324,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2711,7 +2711,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2733,7 +2733,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2750,7 +2750,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2767,7 +2767,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2784,7 +2784,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2801,7 +2801,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2838,7 +2838,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2855,7 +2855,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2872,7 +2872,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2889,7 +2889,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2906,7 +2906,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2923,7 +2923,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2940,7 +2940,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -2967,7 +2967,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -2984,7 +2984,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3001,7 +3001,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3018,7 +3018,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3035,7 +3035,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3437,7 +3437,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3459,7 +3459,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3476,7 +3476,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3493,7 +3493,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3510,7 +3510,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3527,7 +3527,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3564,7 +3564,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3581,7 +3581,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3598,7 +3598,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3615,7 +3615,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3632,7 +3632,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3649,7 +3649,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3666,7 +3666,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
@@ -3693,7 +3693,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3710,7 +3710,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3727,7 +3727,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3744,7 +3744,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
@@ -3761,7 +3761,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
index 9c879e4bbf7583..db01f221f29115 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
@@ -13,7 +13,7 @@ define float @syncscope_system(ptr %addr, float %val) {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -83,7 +83,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -303,7 +303,7 @@ define float @flat_atomicrmw_fadd_f32__align32(ptr %addr, float %val) {
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 71311217eee8c1..84d9a64efa0f7d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -2056,7 +2056,7 @@ define void @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2077,7 +2077,7 @@ define float @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2140,7 +2140,7 @@ define void @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_deno
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2161,7 +2161,7 @@ define float @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denor
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2182,7 +2182,7 @@ define void @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignor
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2203,7 +2203,7 @@ define float @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2306,7 +2306,7 @@ define void @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(pt
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2347,7 +2347,7 @@ define float @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode(ptr
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2430,7 +2430,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignor
; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2451,7 +2451,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignor
; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2626,7 +2626,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2642,7 +2642,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2658,7 +2658,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2682,7 +2682,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2703,7 +2703,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2719,7 +2719,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2735,7 +2735,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2759,7 +2759,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
@@ -2780,7 +2780,7 @@ define void @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_m
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2801,7 +2801,7 @@ define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mo
; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -2831,7 +2831,7 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_
; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
@@ -2865,7 +2865,7 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_
; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]]
; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64
; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.ignore.denormal.mode [[META0]]
+; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]]
; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
index fa77ab87e6535d..b4025c3cfb53c5 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
@@ -752,7 +752,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -769,7 +769,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -786,7 +786,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -803,7 +803,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -825,7 +825,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -842,7 +842,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -869,7 +869,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -886,7 +886,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -903,7 +903,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -920,7 +920,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -942,7 +942,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -959,7 +959,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -986,7 +986,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1003,7 +1003,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1020,7 +1020,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1037,7 +1037,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1059,7 +1059,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1076,7 +1076,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1103,7 +1103,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1120,7 +1120,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1137,7 +1137,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1154,7 +1154,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1176,7 +1176,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1193,7 +1193,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1220,7 +1220,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1237,7 +1237,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1254,7 +1254,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1271,7 +1271,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1293,7 +1293,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1310,7 +1310,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1337,7 +1337,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1354,7 +1354,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1371,7 +1371,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1388,7 +1388,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1410,7 +1410,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1427,7 +1427,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1546,7 +1546,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1568,7 +1568,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1590,7 +1590,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1612,7 +1612,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1726,7 +1726,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1748,7 +1748,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1770,7 +1770,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1792,7 +1792,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1906,7 +1906,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1928,7 +1928,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1950,7 +1950,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1972,7 +1972,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
index 5660b8f3544301..6700839d814800 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
@@ -657,7 +657,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -679,7 +679,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -696,7 +696,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -713,7 +713,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -730,7 +730,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -752,7 +752,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -769,7 +769,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -796,7 +796,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -813,7 +813,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -830,7 +830,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -847,7 +847,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -869,7 +869,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -886,7 +886,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -913,7 +913,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -930,7 +930,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -947,7 +947,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -964,7 +964,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -986,7 +986,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1003,7 +1003,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1030,7 +1030,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1047,7 +1047,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1064,7 +1064,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1081,7 +1081,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1103,7 +1103,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1120,7 +1120,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1147,7 +1147,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1164,7 +1164,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1181,7 +1181,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1198,7 +1198,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1220,7 +1220,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1237,7 +1237,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1290,7 +1290,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1356,7 +1356,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1378,7 +1378,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1400,7 +1400,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1422,7 +1422,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1536,7 +1536,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1558,7 +1558,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1580,7 +1580,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1602,7 +1602,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1716,7 +1716,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1738,7 +1738,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1760,7 +1760,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -1782,7 +1782,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
index a69d3549bc90c8..c28e11f60f389a 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
@@ -704,7 +704,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -721,7 +721,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -738,7 +738,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -755,7 +755,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -777,7 +777,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -794,7 +794,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -821,7 +821,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -838,7 +838,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -855,7 +855,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -882,7 +882,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -899,7 +899,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -926,7 +926,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -943,7 +943,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -960,7 +960,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -977,7 +977,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -999,7 +999,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1016,7 +1016,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1043,7 +1043,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1060,7 +1060,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1077,7 +1077,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1104,7 +1104,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1121,7 +1121,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1148,7 +1148,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1165,7 +1165,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1182,7 +1182,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1209,7 +1209,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1226,7 +1226,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1253,7 +1253,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1270,7 +1270,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1287,7 +1287,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1314,7 +1314,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1331,7 +1331,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1450,7 +1450,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1472,7 +1472,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1494,7 +1494,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1516,7 +1516,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1630,7 +1630,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1652,7 +1652,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1674,7 +1674,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1696,7 +1696,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1810,7 +1810,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1832,7 +1832,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1854,7 +1854,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1876,7 +1876,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
index f5a59e6bd519f4..19f02dea213297 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
@@ -609,7 +609,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -631,7 +631,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -648,7 +648,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -665,7 +665,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -692,7 +692,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -709,7 +709,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -736,7 +736,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -753,7 +753,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -770,7 +770,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -787,7 +787,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -809,7 +809,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -826,7 +826,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -853,7 +853,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -870,7 +870,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -887,7 +887,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -914,7 +914,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -931,7 +931,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -958,7 +958,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -975,7 +975,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -992,7 +992,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1019,7 +1019,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1036,7 +1036,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1063,7 +1063,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1080,7 +1080,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1097,7 +1097,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1124,7 +1124,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1141,7 +1141,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1194,7 +1194,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1260,7 +1260,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1282,7 +1282,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1304,7 +1304,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1326,7 +1326,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1440,7 +1440,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1462,7 +1462,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1484,7 +1484,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1506,7 +1506,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1620,7 +1620,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1642,7 +1642,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1664,7 +1664,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -1686,7 +1686,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll
index 2b9cff80172012..9e9503dfbd3819 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll
@@ -836,7 +836,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX7-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX7-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX7-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
+; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -853,7 +853,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX900-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -870,7 +870,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr,
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
More information about the llvm-commits
mailing list