[llvm-branch-commits] [llvm] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics (PR #97050)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jul 2 10:02:39 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/97050
>From 5fe43ecedbf2411d4ba6ff251a8a33d72900f264 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 27 Jun 2024 16:32:48 +0200
Subject: [PATCH] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics
These are now fully covered by atomicrmw.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 -
llvm/lib/IR/AutoUpgrade.cpp | 14 +-
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 -
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 -
.../Target/AMDGPU/AMDGPUSearchableTables.td | 2 -
llvm/lib/Target/AMDGPU/FLATInstructions.td | 2 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +-
llvm/test/Bitcode/amdgcn-atomic.ll | 22 ++
.../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 54 -----
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 100 ---------
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 193 ------------------
11 files changed, 33 insertions(+), 368 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 71b1e832bde3c..9cf4d6352d23d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2907,10 +2907,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}
-// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 5beefaa1ec701..8faaff5636665 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1034,7 +1034,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
- Name.starts_with("ds.fmax")) {
+ Name.starts_with("ds.fmax") ||
+ Name.starts_with("global.atomic.fadd.v2bf16") ||
+ Name.starts_with("flat.atomic.fadd.v2bf16")) {
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
// declaration.
NewFn = nullptr;
@@ -2352,7 +2354,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
.StartsWith("ds.fmin", AtomicRMWInst::FMin)
.StartsWith("ds.fmax", AtomicRMWInst::FMax)
.StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
- .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
+ .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap)
+ .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd)
+ .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd);
unsigned NumOperands = CI->getNumOperands();
if (NumOperands < 3) // Malformed bitcode.
@@ -2407,8 +2411,10 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
if (PtrTy->getAddressSpace() != 3) {
- RMW->setMetadata("amdgpu.no.fine.grained.memory",
- MDNode::get(F->getContext(), {}));
+ MDNode *EmptyMD = MDNode::get(F->getContext(), {});
+ RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+ if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy())
+ RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
}
if (IsVolatile)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index c6dbc58395e48..db8b44149cf47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -620,12 +620,10 @@ multiclass local_addr_space_atomic_op {
defm int_amdgcn_flat_atomic_fadd : noret_op;
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
-defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
defm int_amdgcn_flat_atomic_fmin : noret_op;
defm int_amdgcn_flat_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
-defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9e7694f41d6b8..74686cd10512c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4897,8 +4897,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index a323f63767737..c25314ae25dc0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -250,8 +250,6 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 16dc019ede810..484d78959503d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1670,13 +1670,11 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd",
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 11227875144b9..c2e80603805db 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1346,9 +1346,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
+ case Intrinsic::amdgcn_atomic_cond_sub_u32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1451,14 +1449,12 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_flat_atomic_fadd:
- case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fadd:
- case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin:
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index a114c27bafd4a..9563d178e6433 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -300,4 +300,26 @@ define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float
ret float %result0
}
+declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr, <2 x i16>)
+
+define <2 x i16> @upgrade_amdgcn_flat_atomic_fadd_v2bf16_p0(ptr %ptr, <2 x i16> %data) {
+ ; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat>
+ ; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ ; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16>
+ ; CHECK-NEXT: ret <2 x i16> [[BC1]]
+ %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
+ ret <2 x i16> %result
+}
+
+declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1), <2 x i16>)
+
+define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %ptr, <2 x i16> %data) {
+ ; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat>
+ ; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+ ; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16>
+ ; CHECK-NEXT: ret <2 x i16> [[BC1]]
+ %result = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
+ ret <2 x i16> %result
+}
+
attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 710d48be037e0..4a6ad44742119 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -4,10 +4,6 @@
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
-; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
-declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
-
define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret:
; GFX940: ; %bb.0:
@@ -104,56 +100,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
ret <2 x half> %ret
}
-define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX940-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
-define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b32_e32 v1, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3]
-; GFX940-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) {
; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset:
; GFX940: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 37a201e390f81..4bfdeb6f02a1d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -7,9 +7,7 @@ declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>,
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
-declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
-declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
@@ -59,104 +57,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
ret <2 x half> %ret
}
-define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
-; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX12-GISEL-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
-; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
-define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16:
; GFX12-SDAG: ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 5f3cd81be77d5..b4032d8a3da6d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -4,10 +4,6 @@
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
-
-; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
-declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
@@ -183,97 +179,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
ret <2 x half> %ret
}
-define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX940-NEXT: s_endpgm
-;
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX12-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
-define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3]
-; GFX940-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
- ret void
-}
-
-define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
- ret <2 x i16> %ret
-}
-
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
; GFX940: ; %bb.0:
@@ -566,104 +471,6 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
ret void
}
-define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023
- %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data)
- ret <2 x i16> %result
-}
-
-define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256
- %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data)
- ret <2 x i16> %result
-}
-
-define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023
- %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data)
- ret void
-}
-
-define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) {
-; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256
- %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data)
- ret void
-}
-
attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
!0 = !{}
More information about the llvm-branch-commits
mailing list