[llvm-branch-commits] [llvm] AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (PR #95394)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jun 14 12:43:08 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/95394
>From a307ee030a6b46ffff17f06f00d91886c98f941c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 8 Jun 2024 09:52:14 +0200
Subject: [PATCH] AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for
global/flat
Unlike the existing fadd cases, choose to ignore the requirement for
amdgpu-unsafe-fp-atomics in case of fine-grained memory access. This
is to minimize migration pain to the new atomic control metadata. This
should not break any users, as the atomic intrinsics are still
directly consumed, and clang does not yet produce vector FP atomicrmw.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 +
llvm/lib/Target/AMDGPU/FLATInstructions.td | 6 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 29 +-
.../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 62 +-
.../AMDGPU/expand-atomic-rmw-fadd.ll | 1928 +++++++++++++----
.../AMDGPU/expand-atomic-v2bf16-agent.ll | 1540 +++++++++++--
.../AMDGPU/expand-atomic-v2bf16-system.ll | 1540 +++++++++++--
.../AMDGPU/expand-atomic-v2f16-agent.ll | 1398 ++++++++++--
.../AMDGPU/expand-atomic-v2f16-system.ll | 1398 ++++++++++--
.../AMDGPU/expand-atomicrmw-fp-vector.ll | 102 +-
10 files changed, 6663 insertions(+), 1347 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0234cd9088ae7..19f19eb7b7f44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
});
}
+ if (ST.hasAtomicBufferGlobalPkAddF16Insts())
+ Atomic.legalFor({{V2F16, GlobalPtr}});
+ if (ST.hasAtomicGlobalPkAddBF16Inst())
+ Atomic.legalFor({{V2BF16, GlobalPtr}});
+ if (ST.hasAtomicFlatPkAdd16Insts())
+ Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 818cbde592432..99e3a8084f067 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}
let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
@@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
}
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
+// FIXME: These do not have signed offsets
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
-
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e8c2773c33003..4e3ef6aad3062 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15941,6 +15941,16 @@ static bool isHalf2OrBFloat2(Type *Ty) {
return false;
}
+static bool isHalf2(Type *Ty) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
+}
+
+static bool isBFloat2(Type *Ty) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -16009,10 +16019,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
AS != AMDGPUAS::BUFFER_FAT_POINTER)
return AtomicExpansionKind::CmpXChg;
- // TODO: gfx940 supports v2f16 and v2bf16
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ // gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else {
+ // gfx90a, gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // gfx940, gfx12
+ // FIXME: Need to skip buffer_fat_pointer?
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ }
+
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 4e21ef8379342..fff341b07881b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
%result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
%unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
%result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 3868f7bbc037a..533f0f2691045 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -4418,99 +4418,15 @@ define double @test_atomicrmw_fadd_f64_region_ret(ptr addrspace(2) %ptr, double
}
define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent(ptr %ptr, <2 x half> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x half> [[TMP5]]
-;
- %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret <2 x half> %res
-}
-
-define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
-;
- %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x half> [[TMP5]]
-;
- %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret <2 x half> %res
-}
-
-define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
-;
- %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) {
-; CI-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
-; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
; CI: atomicrmw.start:
; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4518,15 +4434,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr,
; CI: atomicrmw.end:
; CI-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX9: atomicrmw.start:
; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4534,15 +4450,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr,
; GFX9: atomicrmw.end:
; GFX9-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
-; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4550,15 +4466,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr,
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
-; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4566,19 +4482,19 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr,
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
-; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
; GFX940-NEXT: ret <2 x half> [[RES]]
;
-; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
-; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4586,20 +4502,20 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr,
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
- %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
ret <2 x half> %res
}
-define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) {
-; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
-; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
; CI: atomicrmw.start:
; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4607,15 +4523,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <
; CI: atomicrmw.end:
; CI-NEXT: ret void
;
-; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX9: atomicrmw.start:
; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4623,15 +4539,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <
; GFX9: atomicrmw.end:
; GFX9-NEXT: ret void
;
-; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
-; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4639,15 +4555,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
-; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
-; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4655,19 +4571,19 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
-; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
-; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
; GFX940-NEXT: ret void
;
-; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
-; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
@@ -4675,225 +4591,1339 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
- %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x bfloat> [[TMP5]]
-;
- %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
- ret <2 x bfloat> %res
-}
-
-define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
-;
- %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x bfloat> [[TMP5]]
-;
- %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
- ret <2 x bfloat> %res
-}
-
-define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
-;
- %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
ret void
}
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) {
-; CI-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
-; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CI: atomicrmw.end:
-; CI-NEXT: ret <2 x bfloat> [[TMP5]]
+; CI-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GFX9: atomicrmw.end:
-; GFX9-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX9-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
-; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GFX908: atomicrmw.end:
-; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
-; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
-; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX90A-NEXT: ret <2 x half> [[RES]]
;
-; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
-; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
-; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
;
-; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
-; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GFX11: atomicrmw.end:
-; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
- %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
- ret <2 x bfloat> %res
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret <2 x half> %res
}
-define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) {
-; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
-; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CI: atomicrmw.end:
; CI-NEXT: ret void
;
-; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
-; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GFX9: atomicrmw.end:
; GFX9-NEXT: ret void
;
-; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
-; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_local_agent(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret <2 x half> %res
+}
+
+define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret <2 x half> %res
+}
+
+define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret <2 x half> %res
+}
+
+define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4901,15 +5931,15 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr,
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
-; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
-; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4917,19 +5947,19 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr,
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
-; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
-; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
; GFX940-NEXT: ret void
;
-; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
-; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
@@ -4937,173 +5967,183 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr,
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
- %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret void
}
-define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x half> [[TMP5]]
-;
- %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret <2 x half> %res
-}
-
-define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 {
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x half> [[TMP5]]
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret <2 x half> %res
-}
-
-define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
- ret <2 x bfloat> %res
-}
-
-define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
;
- %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
- ret void
-}
-
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret <2 x bfloat> %res
}
define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 {
-; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
-; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
+; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret void
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
index 4f053af7aed93..26b84f82524e2 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
@@ -41,264 +41,1404 @@
;---------------------------------------------------------------------
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
@@ -310,7 +1450,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -848,12 +1988,8 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
-; GFX803: {{.*}}
-; GFX906: {{.*}}
-; GFX908: {{.*}}
-; GFX90A: {{.*}}
-; GFX940: {{.*}}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
index 0339e885ca770..d01cfbe035f64 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
@@ -41,264 +41,1404 @@
;---------------------------------------------------------------------
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
@@ -310,7 +1450,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -848,12 +1988,8 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
-; GFX803: {{.*}}
-; GFX906: {{.*}}
-; GFX908: {{.*}}
-; GFX90A: {{.*}}
-; GFX940: {{.*}}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
index 0fc6f1134ff40..3b0b7c9209588 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
@@ -41,264 +41,1260 @@
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
@@ -310,7 +1306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -848,12 +1844,10 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
-; GFX803: {{.*}}
-; GFX906: {{.*}}
-; GFX908: {{.*}}
-; GFX90A: {{.*}}
-; GFX940: {{.*}}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
index 20f39b5ed0c86..3478f23e0b200 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
@@ -41,264 +41,1260 @@
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret <2 x half> [[RES]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
@@ -310,7 +1306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -848,12 +1844,10 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
-; GFX803: {{.*}}
-; GFX906: {{.*}}
-; GFX908: {{.*}}
-; GFX90A: {{.*}}
-; GFX940: {{.*}}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
index 1411890e01dc5..4556ad60ccf8c 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck -check-prefixes=CHECK,GFX900 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck -check-prefixes=CHECK,GFX940 %s
;---------------------------------------------------------------------
; atomicrmw fadd
@@ -70,44 +70,76 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(ptr addrspac
}
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
-; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
-; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK: atomicrmw.end:
-; CHECK-NEXT: ret <2 x half> [[TMP5]]
+; GFX900-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
+; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX900: atomicrmw.start:
+; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX900: atomicrmw.end:
+; GFX900-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX90A-NEXT: ret <2 x half> [[RES]]
+;
+; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
ret <2 x half> %res
}
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
-; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK: atomicrmw.end:
-; CHECK-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX900-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
+; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX900: atomicrmw.start:
+; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX900: atomicrmw.end:
+; GFX900-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
ret <2 x bfloat> %res
More information about the llvm-branch-commits
mailing list