[llvm] e14474a - AMDGPU/GlobalISel: Select llvm.amdgcn.global.atomic.fadd
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 12 07:05:03 PDT 2020
Author: Matt Arsenault
Date: 2020-08-12T10:04:53-04:00
New Revision: e14474a39a14b3c86c6c5d5ed9bf11467a0bbe9b
URL: https://github.com/llvm/llvm-project/commit/e14474a39a14b3c86c6c5d5ed9bf11467a0bbe9b
DIFF: https://github.com/llvm/llvm-project/commit/e14474a39a14b3c86c6c5d5ed9bf11467a0bbe9b.diff
LOG: AMDGPU/GlobalISel: Select llvm.amdgcn.global.atomic.fadd
Remove the intermediate transform (to ISD::ATOMIC_LOAD_FADD) in the DAG
path; the intrinsic is now matched directly by the selection patterns. I
believe this is the last non-deprecated intrinsic that needs handling.
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/BUFInstructions.td
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
Removed: (none)
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ae90c55d7c07..eb41e56ff519 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4245,6 +4245,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_csub:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 2e30476fb258..45eca4b3216a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1099,7 +1099,7 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f99354b77272..3ee01d5bb93c 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -717,7 +717,7 @@ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
@@ -784,7 +784,7 @@ class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, $slc)
+ (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset, $slc)
>;
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
@@ -976,7 +976,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_g
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_fadd_global_noret, v2f16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ddb84b4e81f6..b744091c02b4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1140,6 +1140,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
->getPointerElementType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
+
+ // FIXME: Should report an atomic ordering here.
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
@@ -7521,21 +7523,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, VT,
M->getMemOperand());
}
-
- case Intrinsic::amdgcn_global_atomic_fadd: {
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2), // ptr
- Op.getOperand(3) // vdata
- };
-
- EVT VT = Op.getOperand(3).getValueType();
- auto *M = cast<MemSDNode>(Op);
-
- return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
- DAG.getVTList(VT, MVT::Other), Ops,
- M->getMemOperand()).getValue(1);
- }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
@@ -8567,7 +8554,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
-/// specific intrinsic.
+/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
switch (N->getOpcode()) {
case ISD::STORE:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index cee2d9453394..618b0a142ee9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -316,7 +316,7 @@ defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
} // End let AddressSpaces = ...
} // End foreach AddrSpace
-def atomic_fadd_global_noret : PatFrag<
+def atomic_fadd_global_noret_impl : PatFrag<
(ops node:$ptr, node:$value),
(atomic_load_fadd node:$ptr, node:$value)> {
// FIXME: Move this
@@ -325,14 +325,9 @@ def atomic_fadd_global_noret : PatFrag<
let AddressSpaces = StoreAddress_global.AddrSpaces;
}
-def atomic_pk_fadd_global_noret : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_load_fadd node:$ptr, node:$value)> {
- // FIXME: Move this
- let MemoryVT = v2f16;
- let IsAtomic = 1;
- let AddressSpaces = StoreAddress_global.AddrSpaces;
-}
+def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_global_atomic_fadd node:$src0, node:$src1),
+ (atomic_fadd_global_noret_impl node:$src0, node:$src1)]>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
new file mode 100644
index 000000000000..60ba088404a2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+
+define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
+ ret void
+}
+
+define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_2048:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_movk_i32 s4, 0x800
+; GFX908-NEXT: s_mov_b32 s5, 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s5
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
+ call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+ ret void
+}
+
+define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b32 s4, 0xfffff804
+; GFX908-NEXT: s_mov_b32 s5, -1
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s5
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
+ call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_ss:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_add_u32 s0, s0, 0x800
+; GFX908-NEXT: s_addc_u32 s1, s1, 0
+; GFX908-NEXT: v_mov_b32_e32 v0, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_endpgm
+ %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
+ call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+ ret void
+}
+
+define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+; GFX908-LABEL: global_atomic_fadd_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+ ret void
+}
+
+define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b32 s4, 0xfffff804
+; GFX908-NEXT: s_mov_b32 s5, -1
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s5
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511
+ call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
+ ret void
+}
+
+declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0
+declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
+
+attributes #0 = { argmemonly nounwind willreturn }
More information about the llvm-commits mailing list