[llvm] ae72fee - [AMDGPU] gfx11 Select on Buffer Atomic FAdd Rtn type
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 23 08:35:13 PDT 2022
Author: Joe Nash
Date: 2022-06-23T11:05:32-04:00
New Revision: ae72fee74ece594b8f30729b325a1f1cc1533d36
URL: https://github.com/llvm/llvm-project/commit/ae72fee74ece594b8f30729b325a1f1cc1533d36
DIFF: https://github.com/llvm/llvm-project/commit/ae72fee74ece594b8f30729b325a1f1cc1533d36.diff
LOG: [AMDGPU] gfx11 Select on Buffer Atomic FAdd Rtn type
Reviewed By: #amdgpu, foad, rampitec
Differential Revision: https://reviews.llvm.org/D128205
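
In short, selection of the returning (Rtn) form of a buffer atomic FAdd is now gated on the result type rather than on gfx90a alone: f32 results require the subtarget's AtomicFaddRtnInsts feature (present on gfx11, hence the new test), while v2f16 and f64 results still require gfx90a. As a reading aid only, the new predicate condensed from the GlobalISel and SelectionDAG hunks below (all names come from the diff; this is not additional code in the commit):

  #include "GCNSubtarget.h"
  #include "llvm/Support/LowLevelTypeImpl.h"

  // Mirrors AMDGPU::hasAtomicFaddRtnForTy (GlobalISel) and the MVT-based
  // SITargetLowering::hasAtomicFaddRtnForTy (SelectionDAG) added below.
  bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty) {
    if (Ty == LLT::scalar(32))           // f32: needs AtomicFaddRtnInsts
      return Subtarget.hasAtomicFaddRtnInsts();
    if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64))
      return Subtarget.hasGFX90AInsts(); // v2f16 / f64: still gfx90a only
    return false;                        // otherwise only the no-return form
  }

When the predicate is false and the atomic's result is actually used, the existing "return versions of fp atomics not supported" diagnostic is still emitted.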
Added:
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll

Modified:
    llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
    llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h

Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index cabdc6998011..1bbdc39a7a5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -7,8 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUGlobalISelUtils.h"
+#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/Constants.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;
using namespace MIPatternMatch;
@@ -66,3 +68,12 @@ bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
return true;
return (Mask[0] & 2) == (Mask[1] & 2);
}
+
+bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget,
+ const LLT &Ty) {
+ if (Ty == LLT::scalar(32))
+ return Subtarget.hasAtomicFaddRtnInsts();
+ if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64))
+ return Subtarget.hasGFX90AInsts();
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 14d3a3fb7997..5c600d059b7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -16,6 +16,8 @@
namespace llvm {
class MachineRegisterInfo;
+class GCNSubtarget;
+class LLT;
namespace AMDGPU {
@@ -24,7 +26,7 @@ std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
-
+bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8fe735c1fcd1..76a39d011569 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2994,13 +2994,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
- if (STI.hasGFX90AInsts())
+ const Register DefReg = MI.getOperand(0).getReg();
+ LLT DefTy = MRI->getType(DefReg);
+ if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
return selectImpl(MI, *CoverageInfo);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+ if (!MRI->use_nodbg_empty(DefReg)) {
Function &F = MBB->getParent()->getFunction();
DiagnosticInfoUnsupported
NoFpRet(F, "return versions of fp atomics not supported",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 17a2d729788e..01a3e78ea48c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5738,7 +5738,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
Register DstReg = MI.getOperand(0).getReg();
- if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+ if (!MRI.use_empty(DstReg) &&
+ !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
Function &F = B.getMF().getFunction();
DiagnosticInfoUnsupported NoFpRet(
F, "return versions of fp atomics not supported", B.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 887a086d421d..e64d5e028d0a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4362,6 +4362,18 @@ bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return isTypeLegal(VT.getScalarType());
}
+bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
+ switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
+ case MVT::f32:
+ return Subtarget->hasAtomicFaddRtnInsts();
+ case MVT::v2f16:
+ case MVT::f64:
+ return Subtarget->hasGFX90AInsts();
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
// This currently forces unfolding various combinations of fsub into fma with
// free fneg'd operands. As long as we have fast FMA (controlled by
@@ -7399,7 +7411,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
- if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+ if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -12623,7 +12635,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
- Subtarget->hasAtomicFaddInsts()) {
+ Subtarget->hasAtomicFaddNoRtnInsts()) {
if (Subtarget->hasGFX940Insts())
return AtomicExpansionKind::None;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 16f944fc641c..4fbccf0c5850 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -394,6 +394,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineBasicBlock *BB) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
+ bool hasAtomicFaddRtnForTy(SDValue &Op) const;
bool enableAggressiveFMAFusion(EVT VT) const override;
bool enableAggressiveFMAFusion(LLT Ty) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
new file mode 100644
index 000000000000..f05bd673bad5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.rtn_no-rtn.ll
@@ -0,0 +1,99 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+; no-rtn
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ %voffset.add = add i32 %voffset, 4095
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
+ ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
+ ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
+ ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ %voffset.add = add i32 %voffset, 4095
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps void @xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+
+; rtn
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ %voffset.add = add i32 %voffset, 4095
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ %voffset.add = add i32 %voffset, 4095
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps float @xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
+declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
+attributes #0 = { nounwind }