[llvm] [AMDGPU] Introduce ordering parameter to atomic intrinsics and introduce new llvm.amdgcn.image.atomic.load intrinsic. (PR #73613)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 25 02:11:14 PDT 2024
https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/73613
From 48b316d1c7dc3156c374a14baa05fbae5350c5de Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <Stefan.Stipanovic at amd.com>
Date: Fri, 22 Dec 2023 14:46:19 +0100
Subject: [PATCH] [AMDGPU] Introduce llvm.amdgcn.image.atomic.load intrinsic.
This intrinsic should behave mostly identically to llvm.amdgcn.image.load, except that:
- It is not marked as IntrReadMem, so that the implied memory semantics are preserved.
- When lowering, its MachineMemOperand is given the "acquire" memory semantics.
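For illustration, a minimal IR sketch of how the new intrinsic is intended to be called. It assumes, as the tests below and the change in SITargetLowering::getTgtMemIntrinsic suggest, that the atomic ordering is packed into bits 4-6 of the final cachepolicy immediate (CPol::ATOMIC_ORDERING = 0x70), so a value of 64 (4 << 4) requests an acquire load and 112 (7 << 4) a sequentially consistent one; the function name here is made up for the example:

  define amdgpu_ps float @acquire_image_load(<8 x i32> inreg %rsrc, i32 %s) {
    ; same operands as llvm.amdgcn.image.load.1d, except the last immediate
    ; also carries the ordering: 64 = acquire encoded in bits 4-6 of cachepolicy
    %v = call float @llvm.amdgcn.image.atomic.load.1d.f32.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 64)
    ret float %v
  }
  declare float @llvm.amdgcn.image.atomic.load.1d.f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg)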
Change-Id: Id1cf412a031a6cec6f606674754c2d5609926be0
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 9 +-
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +-
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 6 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 10 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 6 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +
llvm/lib/Target/AMDGPU/MIMGInstructions.td | 29 ++
llvm/lib/Target/AMDGPU/SIDefines.h | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 10 +-
llvm/test/CodeGen/AMDGPU/atomic-image-load.ll | 256 ++++++++++++++++++
.../CodeGen/AMDGPU/force-store-sc0-sc1.ll | 13 +-
13 files changed, 344 insertions(+), 26 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/atomic-image-load.ll
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 7bb12d8f065c9d..b1259a13bfd8df 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -36,6 +36,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
@@ -1299,7 +1300,8 @@ class SelectionDAG {
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic);
inline SDValue getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
@@ -1307,11 +1309,12 @@ class SelectionDAG {
MaybeAlign Alignment = std::nullopt,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic) {
// Ensure that codegen never sees alignment 0
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo,
Alignment.value_or(getEVTAlign(MemVT)), Flags,
- Size, AAInfo);
+ Size, AAInfo, Ordering);
}
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 612433b54f6e44..790e830772915d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1160,6 +1160,8 @@ class TargetLoweringBase {
MaybeAlign align = Align(1); // alignment
MachineMemOperand::Flags flags = MachineMemOperand::MONone;
+
+ AtomicOrdering ordering = AtomicOrdering::NotAtomic;
IntrinsicInfo() = default;
};
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0f29653f1f5bec..3b3f26c05c7475 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -848,7 +848,8 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
list<IntrinsicProperty> props,
- list<SDNodeProperty> sdnodeprops> : Intrinsic<
+ list<SDNodeProperty> sdnodeprops,
+ string name = ""> : Intrinsic<
P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return
!listconcat(
!foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
@@ -866,7 +867,7 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
!if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
[ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
- !if(P_.IsAtomic, [], [IntrNoSync])),
+ !if(!or(P_.IsAtomic, !eq(name, "int_amdgcn_image_atomic_load")), [], [IntrNoSync])),
"", sdnodeprops>,
@@ -912,7 +913,7 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
def !strconcat(NAME, "_", dim.Name)
: AMDGPUImageDimIntrinsic<
AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
- props, sdnodeprops>;
+ props, sdnodeprops, NAME>;
}
}
@@ -920,6 +921,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
+ defm int_amdgcn_image_atomic_load
+ : AMDGPUImageDimIntrinsicsAll<"ATOMIC_LOAD", [llvm_any_ty], [], [],
+ [SDNPMemOperand]>,
+ AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
[IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 7c95cef2eeb761..cada7f451cd544 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2652,8 +2652,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- MIB.addMemOperand(
- MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
+ MIB.addMemOperand(MF->getMachineMemOperand(
+ MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
+ /*Ranges*/ nullptr, /*SSID*/ SyncScope::System, Info.ordering,
+ Info.ordering));
}
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index add92cf8b31e44..5adf46d7a693bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -55,8 +55,10 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -8367,15 +8369,17 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
- MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
+ MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo,
+ AtomicOrdering Ordering) {
if (!Size && MemVT.isScalableVector())
Size = MemoryLocation::UnknownSize;
else if (!Size)
Size = MemVT.getStoreSize();
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, Flags, Size, Alignment, AAInfo, /*Ranges*/ nullptr,
+ /*SSID*/ SyncScope::System, Ordering, Ordering);
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2bdf48643edc3b..1266eec223aa5c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5077,9 +5077,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops,
- Info.memVT, MPI, Info.align, Info.flags,
- Info.size, I.getAAMetadata());
+ Result = DAG.getMemIntrinsicNode(
+ Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, MPI, Info.align,
+ Info.flags, Info.size, I.getAAMetadata(), Info.ordering);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index aed9bffc551f47..a3cdc852387b8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
@@ -1916,6 +1917,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
+ CPol &= ~AMDGPU::CPol::ATOMIC_ORDERING;
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index fe4db0ebb0262d..6b9a9a5664f167 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -653,6 +653,31 @@ multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
}
}
+multiclass MIMG_NoSampler_Pseudo <mimgopc op, string asm, bit has_d16, bit mip = 0,
+ bit isResInfo = 0,
+ bit msaa = 0> {
+ def "" : MIMGBaseOpcode {
+ let Coordinates = !not(isResInfo);
+ let LodOrClampOrMip = mip;
+ let HasD16 = has_d16;
+ let MSAA = msaa;
+ }
+
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+ mayLoad = !not(isResInfo), isCodeGenOnly = 1 in {
+ let VDataDwords = 1 in
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>;
+ let VDataDwords = 2 in
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>;
+ let VDataDwords = 3 in
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>;
+ let VDataDwords = 4 in
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>;
+ }
+}
+
class MIMG_Store_Helper <mimgopc op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
@@ -1559,7 +1584,11 @@ defm IMAGE_ATOMIC_ADD_FLT : MIMG_Atomic <mimgopc<0x83, MIMG.NOP, MIMG.NOP,
defm IMAGE_ATOMIC_MIN_FLT : MIMG_Atomic <mimgopc<0x84, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_min_num_flt", 0, 1, "image_atomic_min_flt">;
defm IMAGE_ATOMIC_MAX_FLT : MIMG_Atomic <mimgopc<0x85, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_max_num_flt", 0, 1, "image_atomic_max_flt">;
+
+defm IMAGE_ATOMIC_LOAD : MIMG_NoSampler_Pseudo <mimgopc<0x00, 0x00, 0x00>, "image_load", 1>;
+
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
+
let OtherPredicates = [HasExtendedImageInsts] in {
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x40, 0x21>, AMDGPUSample_cl>;
defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x1c, 0x22>, AMDGPUSample_d>;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 98310c3f70c4a7..eb5891bdf3102e 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -425,6 +425,7 @@ enum CPol {
TH_TYPE_STORE = 1 << 8, // TH_STORE policy
TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy
TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not
+ ATOMIC_ORDERING = 0x70, // Atomic ordering bits mask
// Volatile (used to preserve/signal operation volatility for buffer
// operations not a real instruction bit)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5e1d7508503741..9efeb2359aa927 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17,8 +17,10 @@
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
@@ -39,6 +41,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -1235,6 +1238,18 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable;
+ if (RsrcIntr->IsImage) {
+ auto Idx = CI.arg_size() - 1;
+ unsigned OrderingArg =
+ cast<ConstantInt>(CI.getArgOperand(Idx))->getZExtValue();
+ auto Ordering = (OrderingArg & AMDGPU::CPol::ATOMIC_ORDERING) >> 4;
+ unsigned ClearedCPol = OrderingArg & ~AMDGPU::CPol::ATOMIC_ORDERING;
+ ConstantInt *CPol = ConstantInt::get(
+ IntegerType::getInt32Ty(CI.getContext()), ClearedCPol);
+ const_cast<CallInst &>(CI).setArgOperand(Idx, CPol);
+ Info.ordering = static_cast<AtomicOrdering>(Ordering);
+ }
+
switch (IntrID) {
default:
// XXX - Should this be volatile without known ordering?
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808b33e42b..3f851d32ace7fb 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -71,16 +72,17 @@ enum class SIAtomicAddrSpace {
LDS = 1u << 1,
SCRATCH = 1u << 2,
GDS = 1u << 3,
- OTHER = 1u << 4,
+ BUFFER_RESOURCE = 1u << 4,
+ OTHER = 1u << 5,
/// The address spaces that can be accessed by a FLAT instruction.
FLAT = GLOBAL | LDS | SCRATCH,
/// The address spaces that support atomic instructions.
- ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+ ATOMIC = GLOBAL | LDS | SCRATCH | GDS | BUFFER_RESOURCE,
/// All address spaces.
- ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+ ALL = GLOBAL | LDS | SCRATCH | GDS | BUFFER_RESOURCE | OTHER,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -712,6 +714,8 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::SCRATCH;
if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
+ if (AS == AMDGPUAS::BUFFER_RESOURCE)
+ return SIAtomicAddrSpace::BUFFER_RESOURCE;
return SIAtomicAddrSpace::OTHER;
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-image-load.ll b/llvm/test/CodeGen/AMDGPU/atomic-image-load.ll
new file mode 100644
index 00000000000000..26271aeef4f97f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomic-image-load.ll
@@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GFX10
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GFX10
+
+define amdgpu_ps void @test(<8 x i32> inreg %load, <8 x i32> inreg %store) {
+; GFX10-LABEL: test:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: test:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %data0 = call float @llvm.amdgcn.image.atomic.load.1d.f32.i32(i32 1, i32 0, <8 x i32> %load, i32 0, i32 112)
+ ret void
+}
+
+define amdgpu_ps <2 x float> @load_1d_v2f32(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX10-LABEL: load_1d_v2f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_1d_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: load_1d_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: ; return to shader part epilog
+ %v = call <2 x float> @llvm.amdgcn.image.atomic.load.1d.v2f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 64)
+ ret <2 x float> %v
+}
+
+define amdgpu_ps void @load_1d_v3f32(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX10-LABEL: load_1d_v3f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: load_1d_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_1d_v3f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <3 x float> @llvm.amdgcn.image.atomic.load.1d.v3f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 64)
+ ret void
+}
+
+define amdgpu_ps void @load_1d_v4f32(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX10-LABEL: load_1d_v4f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: load_1d_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_1d_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <4 x float> @llvm.amdgcn.image.atomic.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 112)
+ ret void
+}
+
+define amdgpu_ps void @load_2d_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; GFX10-LABEL: load_2d_v4f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: load_2d_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_2d_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <4 x float> @llvm.amdgcn.image.atomic.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 112)
+ ret void
+}
+
+define amdgpu_ps void @load_2darraymsaa_v4f32(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
+; GFX9-LABEL: load_2darraymsaa_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8
+; GFX9-NEXT: v_perm_b32 v1, v1, v0, s8
+; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm glc a16 da
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_2darraymsaa_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
+; GFX11-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc a16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <4 x float> @llvm.amdgcn.image.atomic.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 64)
+ ret void
+}
+
+define amdgpu_ps void @load_2darraymsaa_v4f32_i32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+; GFX10-LABEL: load_2darraymsaa_v4f32_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: load_2darraymsaa_v4f32_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm glc da
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_2darraymsaa_v4f32_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <4 x float> @llvm.amdgcn.image.atomic.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 64)
+ ret void
+}
+
+define amdgpu_ps void @load_3d_v4f32(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) {
+; GFX9-LABEL: load_3d_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v0, s8
+; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm glc a16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_3d_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
+; GFX11-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm glc a16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <4 x float> @llvm.amdgcn.image.atomic.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 64)
+ ret void
+}
+
+define amdgpu_ps void @load_3d_v4f32_i32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+; GFX10-LABEL: load_3d_v4f32_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: load_3d_v4f32_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_3d_v4f32_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %v = call <4 x float> @llvm.amdgcn.image.atomic.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 112)
+ ret void
+}
+
+declare float @llvm.amdgcn.image.atomic.load.1d.f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg)
+declare <2 x float> @llvm.amdgcn.image.atomic.load.1d.v2f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg)
+declare <3 x float> @llvm.amdgcn.image.atomic.load.1d.v3f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg)
+declare <4 x float> @llvm.amdgcn.image.atomic.load.1d.v4f32.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg)
+
+declare <4 x float> @llvm.amdgcn.image.atomic.load.2d.v4f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
+
+declare <4 x float> @llvm.amdgcn.image.atomic.load.2darraymsaa.v4f32.i16(i32 immarg, i16, i16, i16, i16, <8 x i32>, i32 immarg, i32 immarg)
+
+declare <4 x float> @llvm.amdgcn.image.atomic.load.2darraymsaa.v4f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
+
+declare <4 x float> @llvm.amdgcn.image.atomic.load.3d.v4f32.i16(i32 immarg, i16, i16, i16, <8 x i32>, i32 immarg, i32 immarg) #0
diff --git a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
index b35de032030043..b12fd23bf6a93a 100644
--- a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
@@ -85,15 +85,10 @@ entry:
}
define amdgpu_ps void @store_buffer(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-; FORCESC0SC1-LABEL: store_buffer:
-; FORCESC0SC1: ; %bb.0: ; %main_body
-; FORCESC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_buffer:
-; NOSC0SC1: ; %bb.0: ; %main_body
-; NOSC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
-; NOSC0SC1-NEXT: s_endpgm
+; GCN-LABEL: store_buffer:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
+; GCN-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
ret void