[llvm-branch-commits] [llvm] AMDGPU: Remove global/flat atomic fadd intrinics (PR #97051)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jul 23 10:50:20 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/97051
>From 490241c40448f5b7fb61b78ee4d853da7c5fe42d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 27 Jun 2024 16:48:52 +0200
Subject: [PATCH] AMDGPU: Remove global/flat atomic fadd intrinics
These have been replaced with atomicrmw.
---
llvm/docs/ReleaseNotes.rst | 5 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 3 -
llvm/lib/IR/AutoUpgrade.cpp | 4 +-
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 5 -
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 -
.../Target/AMDGPU/AMDGPUSearchableTables.td | 2 -
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 -
llvm/lib/Target/AMDGPU/DSInstructions.td | 6 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 13 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 -
llvm/test/Bitcode/amdgcn-atomic.ll | 32 ++
.../AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll | 8 +-
.../AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll | 46 +-
.../GlobalISel/flat-atomic-fadd.v2f16.ll | 41 +-
.../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 55 +-
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 42 +-
.../global-atomic-fadd.f32-no-rtn.ll | 123 +---
.../GlobalISel/global-atomic-fadd.f32-rtn.ll | 183 ++----
.../GlobalISel/global-atomic-fadd.f64.ll | 167 ------
.../global-atomic-fadd.v2f16-no-rtn.ll | 114 +---
.../global-atomic-fadd.v2f16-rtn.ll | 72 +--
...llvm.amdgcn.global.atomic.fadd-with-ret.ll | 21 -
.../llvm.amdgcn.global.atomic.fadd.ll | 126 ----
.../AMDGPU/cgp-addressing-modes-gfx908.ll | 38 +-
.../CodeGen/AMDGPU/flat-atomic-fadd.f32.ll | 8 +-
.../CodeGen/AMDGPU/flat-atomic-fadd.f64.ll | 4 +-
.../CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll | 63 --
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 22 +-
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 62 +-
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 544 +++---------------
.../CodeGen/AMDGPU/gep-const-address-space.ll | 10 +-
.../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 115 ----
.../AMDGPU/global-atomic-fadd.f32-rtn.ll | 120 ----
.../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 171 ------
.../AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll | 58 +-
.../AMDGPU/global-atomic-fadd.v2f16-rtn.ll | 39 +-
.../AMDGPU/global-saddr-atomics.gfx908.ll | 23 +-
.../llvm.amdgcn.global.atomic.fadd.gfx90a.ll | 56 --
.../AMDGPU/llvm.amdgcn.global.atomic.fadd.ll | 77 ---
.../test/CodeGen/AMDGPU/shl_add_ptr_global.ll | 6 +-
.../CodeGen/AMDGPU/unsupported-atomics.ll | 28 -
.../AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll | 34 --
.../InferAddressSpaces/AMDGPU/flat_atomic.ll | 105 ++--
43 files changed, 468 insertions(+), 2191 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 46fa2a74450e1..e26af824cb6bb 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -68,6 +68,11 @@ Changes to the AArch64 Backend
Changes to the AMDGPU Backend
-----------------------------
+* Removed ``llvm.amdgcn.flat.atomic.fadd`` and
+ ``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
+ :ref:`atomicrmw <i_atomicrmw>` instruction with `fadd` and
+ addrspace(0) or addrspace(1) instead.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 119281ca6103a..1460933699b07 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2868,8 +2868,6 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
// gfx908 intrinsics
// ===----------------------------------------------------------------------===//
-def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-
// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -2908,7 +2906,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index f566a0e3c3043..0e95c6df670dc 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
Name.starts_with("ds.fmax") ||
- Name.starts_with("global.atomic.fadd.v2bf16") ||
- Name.starts_with("flat.atomic.fadd.v2bf16")) {
+ Name.starts_with("global.atomic.fadd") ||
+ Name.starts_with("flat.atomic.fadd")) {
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
// declaration.
NewFn = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index db8b44149cf47..aa5b151adef3a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -618,16 +618,11 @@ multiclass local_addr_space_atomic_op {
}
}
-defm int_amdgcn_flat_atomic_fadd : noret_op;
-defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
defm int_amdgcn_flat_atomic_fmin : noret_op;
defm int_amdgcn_flat_atomic_fmax : noret_op;
-defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
-defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
-defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 546c0a238e430..310bdb04d74ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4887,13 +4887,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
- case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index c25314ae25dc0..ec853f49871f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -239,13 +239,11 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
-def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0b1ecc002ae25..a607f8286f9ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1018,7 +1018,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
switch (IID) {
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
- case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
@@ -1080,7 +1079,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
{NewV, MaskOp});
}
- case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 219246b71fe80..a47140d05a110 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1134,11 +1134,7 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value)),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
}
-
-def : DSAtomicRetPatIntrinsic<DS_ADD_RTN_F64, f64, int_amdgcn_flat_atomic_fadd_local_addrspace>;
-let AddedComplexity = 1 in
-def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>;
-}
+} // End SubtargetPredicate = HasLdsAtomicAddF64
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index a8740c474dce5..161b25acbce74 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1623,25 +1623,17 @@ let OtherPredicates = [isGFX12Only] in {
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
-defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
}
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
-defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
}
let OtherPredicates = [HasAtomicFaddRtnInsts] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
-defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
}
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
-defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
-defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}
@@ -1659,19 +1651,14 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax",
let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
}
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
}
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d34efe8f0bf5..8204893b61482 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1348,13 +1348,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MODereferenceable;
return true;
}
- case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
- case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
@@ -1461,13 +1459,11 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_csub:
- case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin:
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 9563d178e6433..d642372799f56 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -322,4 +322,36 @@ define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %
ret <2 x i16> %result
}
+declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr nocapture, <2 x half>) #0
+
+define <2 x half> @upgrade_amdgcn_flat_atomic_fadd_v2f16_p0_v2f16(ptr %ptr, <2 x half> %data) {
+ ; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
+ ret <2 x half> %result
+}
+
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0
+
+define <2 x half> @upgrade_amdgcn_global_atomic_fadd_v2f16_p1_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
+ ; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ %result = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ ret <2 x half> %result
+}
+
+declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr nocapture, float) #0
+
+define float @upgrade_amdgcn_flat_atomic_fadd_f32_p0_f32(ptr %ptr, float %data) {
+ ; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+ %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
+ ret float %result
+}
+
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
+
+define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr, float %data) {
+ ; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+ %result = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+ ret float %result
+}
+
attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index 820f9ee1ce7f4..7a97ac8211f67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX940-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX11-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
ret void
@@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
index b2a96fb948797..c1cb74cb0e25a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -1,46 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-
-define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
- ret double %ret
-}
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw
@@ -82,6 +42,4 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
ret double %ret
}
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll
index f2f00823e0278..0896e4dc7af14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll
@@ -1,38 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
-define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
- ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic
- ; GFX940: bb.1 (%ir-block.0):
- ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr)
- ; GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
- ret void
-}
-
-define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
- ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic
- ; GFX940: bb.1 (%ir-block.0):
- ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr)
- ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]]
- ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) #0 {
+define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn
; GFX940: bb.1 (%ir-block.0):
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -48,7 +17,7 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %da
ret <2 x half> %ret
}
-define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) #0 {
+define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) {
; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_rtn
; GFX940: bb.1 (%ir-block.0):
; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
@@ -65,7 +34,7 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2
ret <2 x half> %ret
}
-define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) #0 {
+define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) {
; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn
; GFX940: bb.1 (%ir-block.0):
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -80,7 +49,7 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data)
ret void
}
-define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) #0 {
+define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) {
; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_no_rtn
; GFX940: bb.1 (%ir-block.0):
; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
@@ -96,6 +65,4 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x
ret void
}
-attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 031a3633bd375..25c7fc6463c33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -1,22 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
-
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
-
-define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
-; GFX940-LABEL: flat_atomic_fadd_f32_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
-; GFX940-NEXT: s_endpgm
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
- ret void
-}
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefix=GFX940
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
@@ -50,17 +33,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
ret void
}
-define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
-; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
- ret float %ret
-}
-
define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
; GFX940: ; %bb.0:
@@ -75,31 +47,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
ret float %ret
}
-define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2f16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
-; GFX940-NEXT: s_endpgm
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
- ret void
-}
-
-define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) {
; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset:
; GFX940: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index cfd4e1211bb01..44c39320bfd97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1020,20 +1020,47 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[0:1], exec
+; GFX90A-NEXT: s_mov_b32 s4, s1
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB36_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: .LBB36_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[0:1], exec
+; GFX940-NEXT: s_mov_b32 s4, s1
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB36_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB36_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1296,13 +1323,16 @@ define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_atomic_fadd_f64_rtn:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1656,6 +1686,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
@@ -1664,7 +1696,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
@@ -1677,13 +1712,16 @@ define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_atomic_fadd_f64_rtn:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 3f0b86c271538..5eb8a23b1e467 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -1,117 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
- ; GFX908_GFX11: bb.1 (%ir-block.0):
- ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX908_GFX11-NEXT: {{ $}}
- ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
- ; GFX908_GFX11: bb.1 (%ir-block.0):
- ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX908_GFX11-NEXT: {{ $}}
- ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
- ; GFX908_GFX11: bb.1 (%ir-block.0):
- ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX908_GFX11-NEXT: {{ $}}
- ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
- ; GFX908_GFX11: bb.1 (%ir-block.0):
- ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX908_GFX11-NEXT: {{ $}}
- ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret void
-}
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) #0 {
; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
@@ -217,7 +109,4 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
ret void
}
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-
-attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index 2dc40c38343a9..15e6bcfa6f84a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -1,137 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
- ; GFX11: bb.1 (%ir-block.0):
- ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
- ; GFX11: bb.1 (%ir-block.0):
- ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
- ; GFX11: bb.1 (%ir-block.0):
- ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
+ ; GFX90A-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
+ ; GFX90A: bb.1 (%ir-block.0):
+ ; GFX90A-NEXT: successors: %bb.2(0x80000000)
+ ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s32) from %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: bb.2.atomicrmw.start:
+ ; GFX90A-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI %11, %bb.2, [[S_MOV_B]], %bb.1
+ ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_LOAD_DWORD]], %bb.1, %18, %bb.2
+ ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_F32_e64_]], %subreg.sub0, [[PHI1]], %subreg.sub1
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[GLOBAL_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec
+ ; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64_xexec = SI_IF_BREAK [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def $scc
+ ; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A-NEXT: S_BRANCH %bb.3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: bb.3.atomicrmw.end:
+ ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_CMPSWAP_RTN]], %bb.2
+ ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64_xexec = PHI [[SI_IF_BREAK]], %bb.2
+ ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = COPY [[PHI2]]
+ ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
- ; GFX11: bb.1 (%ir-block.0):
- ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) #0 {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
+ ; GFX940: bb.1 (%ir-block.0):
+ ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
+ ; GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
+ ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
; GFX11: bb.1 (%ir-block.0):
@@ -426,7 +342,4 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
ret float %ret
}
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-
-attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
index 5ea8f7370921a..813c230632476 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
@@ -2,170 +2,6 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
; GFX90A: bb.1 (%ir-block.0):
@@ -319,7 +155,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
ret double %ret
}
-declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll
index ace66e6a234aa..db508b5aea8c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll
@@ -3,115 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
- ; GFX908: bb.1 (%ir-block.0):
- ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX908-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
- ; GFX908: bb.1 (%ir-block.0):
- ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX908-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
- ; GFX908: bb.1 (%ir-block.0):
- ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX908-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
- ; GFX908: bb.1 (%ir-block.0):
- ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX908-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) #0 {
+define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn
; GFX908: bb.1 (%ir-block.0):
; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -137,7 +29,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2
ret void
}
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) #0 {
+define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn
; GFX908: bb.1 (%ir-block.0):
; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
@@ -165,6 +57,4 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) in
ret void
}
-attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll
index e895d6f18c979..f11196be89bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll
@@ -2,73 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) #0 {
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -84,7 +18,7 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr,
ret <2 x half> %ret
}
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) #0 {
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
@@ -101,6 +35,4 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1)
ret <2 x half> %ret
}
-attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
deleted file mode 100644
index 9a66fe10ccdf3..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: not --crash llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
-
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>)
-
-; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd)
-
-; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
-define float @global_atomic_fadd_f32_rtn(ptr addrspace(1) %ptr, float %data) {
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_fadd_v2f16_rtn:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
-define <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
deleted file mode 100644
index de91c45000f13..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ /dev/null
@@ -1,126 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-
-define void @global_atomic_fadd_f32(ptr addrspace(1) %ptr, float %data) {
-; GFX908-LABEL: global_atomic_fadd_f32:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: global_atomic_fadd_f32:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-define void @global_atomic_fadd_f32_off_2048(ptr addrspace(1) %ptr, float %data) {
-; GFX908-LABEL: global_atomic_fadd_f32_off_2048:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: global_atomic_fadd_f32_off_2048:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data)
- ret void
-}
-
-define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %data) {
-; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: global_atomic_fadd_f32_off_neg2047:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -511
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) {
-; GFX908-LABEL: global_atomic_fadd_f32_off_ss:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX908-NEXT: v_mov_b32_e32 v1, 0
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, s2
-; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048
-; GFX908-NEXT: s_endpgm
-;
-; GFX90A-LABEL: global_atomic_fadd_f32_off_ss:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048
-; GFX90A-NEXT: s_endpgm
- %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data)
- ret void
-}
-
-define void @global_atomic_fadd_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
-; GFX908-LABEL: global_atomic_fadd_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: global_atomic_fadd_v2f16:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-define void @global_atomic_fadd_v2f16_off_neg2047(ptr addrspace(1) %ptr, <2 x half> %data) {
-; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: global_atomic_fadd_v2f16_off_neg2047:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -511
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %gep, <2 x half> %data)
- ret void
-}
-
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0
-
-attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 21e2a85ab18d9..7587b81e9936d 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -1,26 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s
; Make sure we match the addressing mode offset of globla.atomic.fadd intrinsics across blocks.
define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
-; OPT-NEXT: entry:
-; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
+; OPT-LABEL: define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[ENTRY:.*]]:
+; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR2:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
-; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
-; OPT: if:
-; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN:%.*]], i32 7
-; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[IN_GEP]], float 2.000000e+00)
+; OPT-NEXT: br i1 [[CMP]], label %[[ENDIF:.*]], label %[[IF:.*]]
+; OPT: [[IF]]:
+; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN]], i32 7
+; OPT-NEXT: [[FADD2:%.*]] = atomicrmw fadd ptr addrspace(1) [[IN_GEP]], float 2.000000e+00 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]]
; OPT-NEXT: [[VAL:%.*]] = load volatile float, ptr addrspace(1) undef, align 4
-; OPT-NEXT: br label [[ENDIF]]
-; OPT: endif:
-; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[OUT:%.*]], i32 999999
+; OPT-NEXT: br label %[[ENDIF]]
+; OPT: [[ENDIF]]:
+; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], %[[IF]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[OUT]], i32 999999
; OPT-NEXT: store float [[X]], ptr addrspace(1) [[OUT_GEP]], align 4
-; OPT-NEXT: br label [[DONE:%.*]]
-; OPT: done:
+; OPT-NEXT: br label %[[DONE:.*]]
+; OPT: [[DONE]]:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32:
@@ -36,6 +37,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: .LBB0_2: ; %endif
@@ -51,7 +54,7 @@ entry:
if:
%in.gep = getelementptr float, ptr addrspace(1) %in, i32 7
- %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %in.gep, float 2.0)
+ %fadd2 = atomicrmw fadd ptr addrspace(1) %in.gep, float 2.000000e+00 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
%val = load volatile float, ptr addrspace(1) undef
br label %endif
@@ -71,3 +74,8 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapt
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind readnone willreturn }
attributes #2 = { argmemonly nounwind willreturn }
+
+!0 = !{}
+;.
+; OPT: [[META0]] = !{}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
index 3e9e5056f87d6..ef180cef7ed2a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX940-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -26,7 +26,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX11-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
ret void
@@ -42,7 +42,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
;
@@ -55,7 +55,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+ ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index 9f086440cefa2..736de59cf8dbd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -17,7 +17,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %d
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
+ ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
ret void
@@ -36,7 +36,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
+ ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr)
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
deleted file mode 100644
index 647c5b568b7ad..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX12 %s
-
-define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
- ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic
- ; GFX940: bb.0 (%ir-block.0):
- ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
- ; GFX940-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic
- ; GFX12: bb.0 (%ir-block.0):
- ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
- ; GFX12-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
- ret void
-}
-
-define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
- ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic
- ; GFX940: bb.0 (%ir-block.0):
- ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
- ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]]
- ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- ;
- ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic
- ; GFX12: bb.0 (%ir-block.0):
- ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
- ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 05259b4f51310..f3b75298b89d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-SDAG
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 | FileCheck %s -check-prefix=GFX12-SDAG
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 | FileCheck %s -check-prefix=GFX12-GISEL
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
@@ -17,6 +17,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
@@ -26,6 +28,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret void
@@ -39,8 +43,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn:
@@ -50,8 +56,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret <2 x half> %ret
@@ -65,7 +73,10 @@ define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16:
@@ -75,7 +86,10 @@ define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
@@ -90,8 +104,10 @@ define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x ha
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn:
@@ -101,8 +117,10 @@ define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x ha
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 5322a283d3de4..6e1787faff942 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s -check-prefix=GFX940
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
@@ -15,7 +15,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret:
@@ -25,6 +28,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
ret void
@@ -90,8 +95,10 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_rtn:
@@ -101,8 +108,10 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
ret float %ret
@@ -145,7 +154,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
@@ -155,6 +167,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret void
@@ -164,8 +178,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn:
@@ -175,8 +191,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret <2 x half> %ret
@@ -286,8 +304,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %dat
; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset:
@@ -297,8 +317,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %dat
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 1023
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data)
@@ -312,8 +334,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %dat
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset:
@@ -323,8 +347,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %dat
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -256
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data)
@@ -335,8 +361,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %da
; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset:
@@ -346,8 +374,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %da
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092
-; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 1023
%unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data)
@@ -361,8 +391,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %da
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset:
@@ -372,8 +404,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %da
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024
-; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -256
%unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data)
@@ -384,8 +418,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2
; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset:
@@ -395,8 +431,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i64 1023
%result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data)
@@ -410,8 +448,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset:
@@ -421,8 +461,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i64 -256
%result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data)
@@ -433,8 +475,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x ha
; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset:
@@ -444,8 +488,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092
-; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i64 1023
%unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data)
@@ -459,8 +505,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset:
@@ -470,8 +518,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024
-; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i64 -256
%unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 2615147b488d4..957c10ddf85e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX940
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
@@ -14,11 +14,8 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
-declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
@@ -1019,31 +1016,6 @@ main_body:
ret void
}
-define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
-; GFX90A-LABEL: global_atomic_fadd_f64_noret:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s6
-; GFX90A-NEXT: v_mov_b32_e32 v1, s7
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: global_atomic_fadd_f64_noret:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s6
-; GFX940-NEXT: v_mov_b32_e32 v1, s7
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
@@ -1097,47 +1069,28 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB39_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB39_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB39_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: .LBB39_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1147,45 +1100,26 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB40_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB40_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB40_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB40_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1195,47 +1129,28 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB41_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB41_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB41_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: .LBB41_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1245,70 +1160,32 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB42_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB42_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB42_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) {
-; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: global_atomic_fadd_f64_rtn:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
; GFX90A: ; %bb.0: ; %main_body
@@ -1429,57 +1306,37 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[4:5], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB49_3
-; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
-; GFX90A-NEXT: .LBB49_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB49_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB49_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1658,52 +1515,6 @@ main_body:
ret double %ret
}
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
-; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: flat_atomic_fadd_f64_noret:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
- ret void
-}
-
-define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) {
-; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: flat_atomic_fadd_f64_rtn:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
- ret double %ret
-}
-
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
@@ -1713,7 +1524,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1724,7 +1535,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -1839,85 +1650,16 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) {
; GFX90A-LABEL: local_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB63_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c
-; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB63_2:
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: local_atomic_fadd_f64_noret:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB63_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c
-; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-NEXT: ds_add_f64 v2, v[0:1]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB63_2:
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
- ret void
-}
-
-define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
-; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: local_atomic_fadd_f64_rtn:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
- ret double %ret
-}
-
-define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
-; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic:
-; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_endpgm
;
-; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic:
+; GFX940-LABEL: local_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
@@ -1925,14 +1667,15 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr a
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_endpgm
main_body:
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
+ %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
ret void
}
-define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
-; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
+define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
@@ -1941,7 +1684,7 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %p
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
+; GFX940-LABEL: local_atomic_fadd_f64_rtn:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v3, v2
@@ -1950,49 +1693,30 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %p
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
+ %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
ret double %ret
}
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB67_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB67_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB67_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB67_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2002,42 +1726,23 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB68_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB68_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB68_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB68_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2047,42 +1752,23 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB69_2
-; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB69_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_mov_b64 s[0:1], exec
-; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB69_2
-; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB69_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2157,92 +1843,6 @@ main_body:
ret double %ret
}
-define double @flat_atomic_fadd_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 {
-; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 511
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
- ret double %ret
-}
-
-define double @flat_atomic_fadd_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 {
-; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 -511
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data)
- ret double %ret
-}
-
-define void @flat_atomic_fadd_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 {
-; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 511
- %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
- ret void
-}
-
-define void @flat_atomic_fadd_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 {
-; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 -511
- %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data)
- ret void
-}
-
define double @flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 {
; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset:
; GFX90A: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
index d70d45d44af0f..c8bbafbfa44d8 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
-
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s
define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: IllegalGEPConst:
@@ -17,14 +15,16 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_addc_u32 s1, s5, s1
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
- %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8
+ %i.5 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}
-attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index c1b333c692749..ad3f920eadc91 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -5,118 +5,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
- ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
- ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX908_GFX11_GFX12-NEXT: {{ $}}
- ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
- ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
- ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX908_GFX11_GFX12-NEXT: {{ $}}
- ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
- ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
- ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX908_GFX11_GFX12-NEXT: {{ $}}
- ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
- ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
- ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX908_GFX11_GFX12-NEXT: {{ $}}
- ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
- ;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret void
-}
-
define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
@@ -324,7 +212,4 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
ret void
}
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 1b955eae89159..3951e02d46a8f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -4,126 +4,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
- ; GFX11: bb.0 (%ir-block.0):
- ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
- ; GFX11: bb.0 (%ir-block.0):
- ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
- ; GFX11: bb.0 (%ir-block.0):
- ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- ;
- ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
- ; GFX11: bb.0 (%ir-block.0):
- ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
- ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
- ret float %ret
-}
-
define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index 50bd5bf57dd9f..ba94a53dff03b 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -2,174 +2,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
- %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret void
-}
-
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
- ; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
- ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
- ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
- %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
- ret double %ret
-}
-
define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -299,7 +131,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
ret double %ret
}
-declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
index 60345c0b44339..02e425e6d10a8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
@@ -1,13 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
+
+define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
+ ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX908-NEXT: {{ $}}
@@ -16,10 +17,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -28,14 +29,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
+define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
+ ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX908-NEXT: {{ $}}
@@ -44,10 +45,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs
; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -56,14 +57,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
+define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) {
+ ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX908-NEXT: {{ $}}
@@ -72,10 +73,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -84,14 +85,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
+define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
+ ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX908-NEXT: {{ $}}
@@ -100,10 +101,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr
; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -112,11 +113,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
index c8caf1fe365b1..794a52b6900ea 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -14,15 +14,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspac
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -31,15 +31,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr ad
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) {
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -48,15 +48,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr add
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
- ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
+ ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -65,12 +65,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(p
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
+ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
- %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
index 0147084a6996f..84065c4675ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck --check-prefix=GCN %s
; Test using saddr addressing mode of global_* flat atomic instructions.
@@ -11,10 +11,12 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(ptr addrspace(1) inreg %sbase
; GCN-LABEL: global_fadd_saddr_f32_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep0, float %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %gep0, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
@@ -22,11 +24,13 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(ptr addrspace(1) inreg
; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep1, float %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %gep1, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
@@ -34,10 +38,12 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(ptr addrspace(1) inreg %sba
; GCN-LABEL: global_fadd_saddr_v2f16_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep0, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %gep0, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -45,15 +51,14 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(ptr addrspace(1) inr
; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep1, <2 x half> %data)
+ %ret = atomicrmw fadd ptr addrspace(1) %gep1, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) nocapture, float) #0
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) nocapture, <2 x half>) #0
-
-attributes #0 = { argmemonly nounwind willreturn }
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll
deleted file mode 100644
index af84105747189..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
-
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-
-; GFX90A-LABEL: {{^}}global_atomic_add_f32:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
-define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc
-define amdgpu_ps float @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc
-define amdgpu_ps float @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret <2 x half> %ret
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll
deleted file mode 100644
index 0c3ce3308dd8f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
-
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float)
-
-; GCN-LABEL: {{^}}global_atomic_add_f32:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_add_f32_off4:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-define amdgpu_kernel void @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4
-define amdgpu_kernel void @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16:
-; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
-; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
-; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}}
-define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret void
-}
-
-; Make sure this artificially selects with an incorrect subtarget, but
-; the feature set.
-; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(ptr addrspace(1) %ptr, float %data) #0 {
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}flat_atomic_fadd_f32_wrong_subtarget:
-; GCN: flat_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
-define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(ptr %ptr, float %data) #1 {
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
- ret void
-}
-
-attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts"}
-attributes #1 = { "target-cpu"="gfx803" "target-features"="+flat-atomic-fadd-f32-inst"}
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index 39541537b3647..8ea83da78f889 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -40,12 +40,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr a
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to ptr addrspace(1)
- call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %castback, float 100.0)
+ %unused = atomicrmw fadd ptr addrspace(1) %castback, float 100.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret void
}
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #1
-
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll b/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll
deleted file mode 100644
index 4b84f9175307f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX1030 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100 %s
-
-; GFX906: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.atomic.fadd
-
-; GFX908-LABEL: fadd_test:
-; GFX908: global_atomic_add_f32
-
-; GFX90A-LABEL: fadd_test:
-; GFX90A: global_atomic_add_f32
-
-; GFX940-LABEL: fadd_test:
-; GFX940: global_atomic_add_f32
-
-; GFX1030: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.atomic.fadd
-
-; GFX1100-LABEL: fadd_test:
-; GFX1100: global_atomic_add_f32
-
-define fastcc void @fadd_test(ptr addrspace(1) nocapture noundef %0, float noundef %1) unnamed_addr {
- %3 = tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) noundef %0, float noundef %1)
- ret void
-}
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float)
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll
index b05caa440ddcb..3d529a2c6ef69 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll
@@ -1,20 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data)
declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data)
define amdgpu_kernel void @flat_atomic_fadd_f32_p1(ptr addrspace(1) %ptr, float %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p1
; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], float [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(1) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -24,13 +21,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p2(ptr addrspace(2) %ptr, float
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p2
; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], float [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(2) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -40,13 +35,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p3
; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], float [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(3) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -55,13 +48,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float
define amdgpu_kernel void @flat_atomic_fadd_f32_p4(ptr addrspace(4) %ptr, float %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p4
; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], float [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(4) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -71,13 +62,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p5
; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], float [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(5) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -86,13 +75,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float
define amdgpu_kernel void @flat_atomic_fadd_f32_p6(ptr addrspace(6) %ptr, float %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p6
; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], float [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(6) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -102,13 +89,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p7
; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], float [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(7) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
@@ -117,32 +102,27 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float
define amdgpu_kernel void @flat_atomic_fadd_f32_p99(ptr addrspace(99) %ptr, float %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p99
; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], float [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(99) %ptr to ptr
- %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data)
%max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data)
%min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data)
ret void
}
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
define amdgpu_kernel void @flat_atomic_fadd_f64_p1(ptr addrspace(1) %ptr, double %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p1
; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], double [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(1) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -152,13 +132,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p2(ptr addrspace(2) %ptr, double
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p2
; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], double [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(2) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -168,13 +146,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p3
; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], double [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(3) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -183,13 +159,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double
define amdgpu_kernel void @flat_atomic_fadd_f64_p4(ptr addrspace(4) %ptr, double %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p4
; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], double [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(4) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -199,13 +173,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p5
; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], double [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(5) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -214,13 +186,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double
define amdgpu_kernel void @flat_atomic_fadd_f64_p6(ptr addrspace(6) %ptr, double %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p6
; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], double [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(6) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -230,13 +200,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p7
; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], double [[DATA:%.*]]) {
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(7) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
@@ -245,13 +213,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double
define amdgpu_kernel void @flat_atomic_fadd_f64_p99(ptr addrspace(99) %ptr, double %data) {
; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p99
; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], double [[DATA:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]])
; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]])
; CHECK-NEXT: ret void
;
%cast = addrspacecast ptr addrspace(99) %ptr to ptr
- %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data)
%max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data)
%min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data)
ret void
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index d9c3c4b17090b..57e6fdb35113e 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -1,9 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8
-declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #8
-declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #8
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0
define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-LABEL: InferNothing:
@@ -21,37 +20,49 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr %b, i64 %i.2
- %i.4 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.3, double %c) #8
+ %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}
-
define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: InferFadd:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_cbranch_execz .LBB1_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_load_dword s8, s[2:3], 0x24
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_ashr_i32 s1, s0, 31
-; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
-; CHECK-NEXT: s_add_u32 s0, s4, s0
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
-; CHECK-NEXT: v_mov_b32_e32 v1, s7
-; CHECK-NEXT: s_addc_u32 s1, s5, s1
-; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8
+; CHECK-NEXT: s_ashr_i32 s9, s8, 31
+; CHECK-NEXT: s_lshl_b64 s[2:3], s[8:9], 3
+; CHECK-NEXT: s_add_u32 s2, s4, s2
+; CHECK-NEXT: s_addc_u32 s3, s5, s3
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
+; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
- %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8
+ %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -75,7 +86,7 @@ entry:
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
- %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #8
+ %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #1
ret void
}
@@ -99,27 +110,43 @@ entry:
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
- %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #8
+ %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #1
ret void
}
define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) {
; CHECK-LABEL: InferMixed:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_ashr_i32 s1, s0, 31
-; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; CHECK-NEXT: v_mov_b32_e32 v0, s8
; CHECK-NEXT: v_mov_b32_e32 v1, s9
-; CHECK-NEXT: s_add_u32 s0, s4, s0
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; CHECK-NEXT: s_addc_u32 s1, s5, s1
; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
-; CHECK-NEXT: global_atomic_add_f64 v4, v[2:3], s[0:1] offset:-7
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; CHECK-NEXT: s_cbranch_execz .LBB4_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_load_dword s2, s[2:3], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_ashr_i32 s3, s2, 31
+; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; CHECK-NEXT: s_add_u32 s2, s4, s2
+; CHECK-NEXT: s_addc_u32 s3, s5, s3
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
+; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
@@ -127,13 +154,13 @@ entry:
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
br label %bb1
-bb1:
+bb1: ; preds = %entry
%i.7 = ptrtoint ptr addrspace(1) %i.3 to i64
%i.8 = add nsw i64 %i.7, 1
%i.9 = inttoptr i64 %i.8 to ptr addrspace(1)
- %i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %d, double %c) #23
+ %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
%i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr
- %i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.11, double %c) #23
+ %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -158,10 +185,21 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %bb1
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
-; CHECK-NEXT: v_mov_b32_e32 v1, s7
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_cbranch_execz .LBB5_4
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
@@ -170,7 +208,7 @@ entry:
%i.4 = ptrtoint ptr addrspace(1) %i.3 to i64
br label %bb0
-bb0:
+bb0: ; preds = %bb0, %entry
%phi = phi ptr addrspace(1) [ %i.3, %entry ], [ %i.9, %bb0 ]
%i.7 = ptrtoint ptr addrspace(1) %phi to i64
%i.8 = sub nsw i64 %i.7, 1
@@ -178,12 +216,13 @@ bb0:
%i.9 = inttoptr i64 %i.7 to ptr addrspace(1)
br i1 %cmp2, label %bb1, label %bb0
-bb1:
+bb1: ; preds = %bb0
%i.10 = addrspacecast ptr addrspace(1) %i.9 to ptr
- %i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.10, double %c) #23
+ %0 = atomicrmw fadd ptr %i.10, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
-
+!0 = !{}
More information about the llvm-branch-commits
mailing list