[llvm] r340684 - [AMDGPU] Add support for multi-dword s.buffer.load intrinsic
Tim Renouf via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 25 07:53:18 PDT 2018
Author: tpr
Date: Sat Aug 25 07:53:17 2018
New Revision: 340684
URL: http://llvm.org/viewvc/llvm-project?rev=340684&view=rev
Log:
[AMDGPU] Add support for multi-dword s.buffer.load intrinsic
Summary:
Patch by Marek Olsak and David Stuttard, both of AMD.
This adds a new amdgcn intrinsic supporting s.buffer.load, in particular
its multi-dword variants. These are convenient to use from some front-end
implementations.
Also modified the existing llvm.SI.load.const intrinsic so that it shares
the same underlying implementation.
This change also requires correct lowering when the offset is non-uniform:
the larger dword variants are split into sizes supported by the
non-uniform (VGPR-offset) versions of the load (see the IR sketch after
the SIInstrInfo.cpp diff below).
V2: Addressed minor review comments.
V3: i1 glc is now an i32 cachepolicy, for consistency with the buffer and
tbuffer intrinsics; also fixed a formatting issue.
V4: Added glc test.
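For reference, a minimal IR sketch of how the new intrinsic is called
(the function and %rsrc value here are illustrative placeholders; the
signatures match the declarations added to the smrd.ll test, and the
final i32 operand is the cachepolicy immediate, bit 0 = glc):

declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)

define amdgpu_ps float @sbuffer_load_sketch(<4 x i32> inreg %rsrc) {
  ; Single dword from byte offset 16, glc = 0.
  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 16, i32 0)
  ; Eight dwords from byte offset 32, glc = 1 (cachepolicy bit 0).
  %b = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 32, i32 1)
  %b0 = extractelement <8 x i32> %b, i32 0
  %sum = add i32 %a, %b0
  %f = bitcast i32 %sum to float
  ret float %f
}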
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D51098
Change-Id: I83a6e00681158bb243591a94a51c7baa445f169b
Added:
llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/SMInstructions.td
llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Sat Aug 25 07:53:17 2018
@@ -802,6 +802,14 @@ class AMDGPUBufferLoad : Intrinsic <
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;
+def int_amdgcn_s_buffer_load : Intrinsic <
+ [llvm_anyint_ty],
+ [llvm_v4i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // byte offset(SGPR/VGPR/imm)
+ llvm_i32_ty], // cachepolicy(imm; bit 0 = glc)
+ [IntrNoMem]>,
+ AMDGPURsrcIntrinsic<0>;
+
class AMDGPUBufferStore : Intrinsic <
[],
[llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Sat Aug 25 07:53:17 2018
@@ -4170,6 +4170,7 @@ const char* AMDGPUTargetLowering::getTar
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
+ NODE_NAME_CASE(SBUFFER_LOAD)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Sat Aug 25 07:53:17 2018
@@ -486,6 +486,7 @@ enum NodeType : unsigned {
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_D16,
+ SBUFFER_LOAD,
BUFFER_STORE,
BUFFER_STORE_FORMAT,
BUFFER_STORE_FORMAT_D16,
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Sat Aug 25 07:53:17 2018
@@ -4921,8 +4921,9 @@ SDValue SITargetLowering::LowerINTRINSIC
MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
- Op.getOperand(1),
- Op.getOperand(2)
+ Op.getOperand(1), // Ptr
+ Op.getOperand(2), // Offset
+ DAG.getTargetConstant(0, DL, MVT::i1) // glc
};
MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -4930,7 +4931,26 @@ SDValue SITargetLowering::LowerINTRINSIC
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+ SDVTList VTList = DAG.getVTList(MVT::i32);
+ SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+ VTList, Ops, MVT::i32, MMO);
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ SDValue Ops[] = {
+ Op.getOperand(1), // Ptr
+ Op.getOperand(2), // Offset
+ DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
+ };
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), VT.getStoreSize());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
Op->getVTList(), Ops, VT, MMO);
}
case Intrinsic::amdgcn_fdiv_fast:
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Sat Aug 25 07:53:17 2018
@@ -3904,8 +3904,34 @@ void SIInstrInfo::moveToVALU(MachineInst
Inst.eraseFromParent();
continue;
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
- unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+ unsigned VDst;
+ unsigned NewOpcode;
+
+ switch(Opcode) {
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ splitScalarBuffer(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
+
const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
unsigned Offset = 0;
@@ -3956,7 +3982,7 @@ void SIInstrInfo::moveToVALU(MachineInst
MachineInstr *NewInstr =
BuildMI(*MBB, Inst, Inst.getDebugLoc(),
- get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
+ get(NewOpcode), VDst)
.add(*VAddr) // vaddr
.add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
.addImm(0) // soffset
@@ -4457,6 +4483,73 @@ void SIInstrInfo::splitScalar64BitBFE(Se
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Inst;
+ auto &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);
+ MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+ MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+ MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ unsigned Count = 0;
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+ switch(Opcode) {
+ default:
+ return;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ Count = 2;
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ Count = 4;
+ break;
+ }
+
+ // FIXME: Should also attempt to build VAddr and Offset like the non-split
+ // case (see call site for this function)
+
+ // Create a vector of result registers
+ SmallVector<unsigned, 8> ResultRegs;
+ for (unsigned i = 0; i < Count ; ++i) {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+ .addReg(Offset.getReg()) // offset
+ .addReg(Rsrc.getReg()) // rsrc
+ .addImm(0) // soffset
+ .addImm(i << 4) // inst_offset
+ .addImm(Glc.getImm()) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addMemOperand(*Inst.memoperands_begin());
+ // Extract the four 32-bit sub-registers from the result to add into the final REG_SEQUENCE
+ auto &NewDestOp = NewMI.getOperand(0);
+ for (unsigned i = 0 ; i < 4 ; i++)
+ ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+ RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
+ }
+ // Create a new combined result to replace original with
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+ get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+ for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+ CombinedResBuilder
+ .addReg(ResultRegs[i])
+ .addImm(RI.getSubRegFromChannel(i));
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,
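As an illustration of the splitting path added above (cf. the
smrd_load_nonconst2 test below): when the offset of a wide s.buffer.load
is divergent, the selected S_BUFFER_LOAD_DWORDX8_SGPR is moved to the
VALU, and splitScalarBuffer rewrites it as two BUFFER_LOAD_DWORDX4_OFFEN
loads whose results are recombined with a REG_SEQUENCE. A hypothetical
IR input that takes this path (function and value names are
placeholders):

declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)

define amdgpu_ps float @split_sketch(<4 x i32> inreg %rsrc, i32 %off) {
  ; %off is a VGPR argument (not inreg), so the offset is non-uniform
  ; and the scalar DWORDX8 load must be split.
  %v = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %off, i32 0)
  %e = extractelement <8 x i32> %v, i32 5
  %f = bitcast i32 %e to float
  ret float %f
}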
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Sat Aug 25 07:53:17 2018
@@ -101,6 +101,8 @@ private:
MachineInstr &Inst) const;
void splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
void movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Sat Aug 25 07:53:17 2018
@@ -40,9 +40,9 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
- SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
- [SDNPMayLoad, SDNPMemOperand]
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+ SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+ [SDNPMayLoad, SDNPMemOperand]
>;
def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
Modified: llvm/trunk/lib/Target/AMDGPU/SMInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SMInstructions.td?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SMInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SMInstructions.td Sat Aug 25 07:53:17 2018
@@ -409,6 +409,22 @@ multiclass SMRD_Pattern <string Instr, V
>;
}
+multiclass SMLoad_Pattern <string Instr, ValueType vt> {
+ // 1. Offset as an immediate
+ // name this pattern to reuse AddedComplexity on CI
+ def _IMM : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
+ >;
+
+ // 2. Offset loaded in a 32-bit SGPR
+ def : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
+ >;
+}
+
+
let OtherPredicates = [isSICI] in {
def : GCNPat <
(i64 (readcyclecounter)),
@@ -427,18 +443,12 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX4",
defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-// 1. Offset as an immediate
-def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI
- (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
->;
-
-// 2. Offset loaded in an 32bit SGPR
-def : GCNPat <
- (SIload_constant v4i32:$sbase, i32:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
->;
-
+// Name the pattern to reuse AddedComplexity on CI
+defm SM_LOAD_PATTERN : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
} // End let AddedComplexity = 100
let OtherPredicates = [isVI] in {
@@ -757,7 +767,7 @@ class SMRD_Real_ci <bits<5> op, SM_Pseud
def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
-let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
+let AddedComplexity = SM_LOAD_PATTERN_IMM.AddedComplexity in {
class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
@@ -771,11 +781,17 @@ def : SMRD_Pattern_ci <"S_LOAD_DWORDX4",
def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
-def : GCNPat <
- (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+class SMLoad_Pattern_ci <string Instr, ValueType vt> : GCNPat <
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+ (!cast<InstSI>(Instr) $sbase, $offset, (as_i1imm $glc))> {
let OtherPredicates = [isCI]; // should this be isCIOnly?
}
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORD_IMM_ci", i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX2_IMM_ci", v2i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX4_IMM_ci", v4i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX8_IMM_ci", v8i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX16_IMM_ci", v16i32>;
+
} // End let AddedComplexity = SM_LOAD_PATTERN_IMM.AddedComplexity
Modified: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smrd.ll?rev=340684&r1=340683&r2=340684&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll Sat Aug 25 07:53:17 2018
@@ -106,7 +106,7 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
@@ -119,13 +119,18 @@ main_body:
; offset.
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff
; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc
+define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
%tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1)
+ %s.buffer.float = bitcast i32 %s.buffer to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
ret void
}
@@ -135,14 +140,20 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const2:
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
%tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0)
+ %s.buffer.float = bitcast i32 %s.buffer to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
ret void
}
@@ -150,14 +161,20 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const3:
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
%tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0)
+ %s.buffer.float = bitcast i32 %s.buffer to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
ret void
}
@@ -165,14 +182,95 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const4:
; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
%tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0)
+ %s.buffer.float = bitcast i32 %s.buffer to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
+ ret void
+}
+
+; dwordx2 s.buffer.load
+; GCN-LABEL: {{^}}s_buffer_load_dwordx2:
+; VIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
+; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
+define amdgpu_ps void @s_buffer_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+main_body:
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0)
+ %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0
+ %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
+ %s.buffer.1 = extractelement <2 x i32> %s.buffer, i32 1
+ %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.0.float, float %s.buffer.1.float, i1 true, i1 true) #0
+ ret void
+}
+
+; dwordx4 s.buffer.load
+; GCN-LABEL: {{^}}s_buffer_load_dwordx4:
+; VIGFX9: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
+; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
+define amdgpu_ps void @s_buffer_load_dwordx4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+main_body:
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0)
+ %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0
+ %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
+ %s.buffer.1 = extractelement <4 x i32> %s.buffer, i32 1
+ %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
+ %s.buffer.2 = extractelement <4 x i32> %s.buffer, i32 2
+ %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
+ %s.buffer.3 = extractelement <4 x i32> %s.buffer, i32 3
+ %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
+ ret void
+}
+
+; dwordx8 s.buffer.load
+; GCN-LABEL: {{^}}s_buffer_load_dwordx8:
+; VIGFX9: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
+; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
+define amdgpu_ps void @s_buffer_load_dwordx8(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+main_body:
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0)
+ %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0
+ %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
+ %s.buffer.1 = extractelement <8 x i32> %s.buffer, i32 2
+ %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
+ %s.buffer.2 = extractelement <8 x i32> %s.buffer, i32 5
+ %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
+ %s.buffer.3 = extractelement <8 x i32> %s.buffer, i32 7
+ %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
+ ret void
+}
+
+; dwordx16 s.buffer.load
+; GCN-LABEL: {{^}}s_buffer_load_dwordx16:
+; VIGFX9: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
+; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
+define amdgpu_ps void @s_buffer_load_dwordx16(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+main_body:
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0)
+ %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0
+ %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
+ %s.buffer.1 = extractelement <16 x i32> %s.buffer, i32 3
+ %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
+ %s.buffer.2 = extractelement <16 x i32> %s.buffer, i32 12
+ %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
+ %s.buffer.3 = extractelement <16 x i32> %s.buffer, i32 15
+ %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
ret void
}
@@ -339,10 +437,90 @@ ret_block:
br i1 %outer_br, label %.outer_loop_header, label %ret_block
}
+; SMRD load with a non-const offset
+; GCN-LABEL: {{^}}smrd_load_nonconst0:
+; SIVIGFX9: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; SIVIGFX9: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_ps void @smrd_load_nonconst0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
+main_body:
+ %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+ %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+ %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
+ %s.buffer.float = bitcast i32 %s.buffer to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
+ ret void
+}
+
+; SMRD load with a non-const non-uniform offset
+; GCN-LABEL: {{^}}smrd_load_nonconst1:
+; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; GCN: s_endpgm
+define amdgpu_ps void @smrd_load_nonconst1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
+main_body:
+ %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+ %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+ %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
+ %s.buffer.float = bitcast i32 %s.buffer to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
+ ret void
+}
+
+; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
+; GCN-LABEL: {{^}}smrd_load_nonconst2:
+; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; SIVIGFX9: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; CI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+; GCN: s_endpgm
+define amdgpu_ps void @smrd_load_nonconst2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
+main_body:
+ %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
+ %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+ %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
+ %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1
+ %s.buffer.float = bitcast i32 %s.buffer.elt to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
+ ret void
+}
+
+; SMRD load dwordx2
+; GCN-LABEL: {{^}}smrd_load_dwordx2:
+; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; CI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_ps void @smrd_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
+main_body:
+ %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+ %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
+ %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float>
+ %r.1 = extractelement <2 x float> %s.buffer.float, i32 0
+ %r.2 = extractelement <2 x float> %s.buffer.float, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r.1, float %r.1, float %r.1, float %r.2, i1 true, i1 true) #0
+ ret void
+}
+
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
+declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
+declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
+declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Added: llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll?rev=340684&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll Sat Aug 25 07:53:17 2018
@@ -0,0 +1,36 @@
+; RUN: opt < %s -S -mtriple=amdgcn-- -early-cse | FileCheck %s
+
+; CHECK-LABEL: @no_cse
+; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
+; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
+define void @no_cse(i32 addrspace(1)* %out, <4 x i32> %in) {
+ %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
+ %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
+ %c = add i32 %a, %b
+ store i32 %c, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @cse_zero_offset
+; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
+; CHECK: add i32 [[CSE]], [[CSE]]
+define void @cse_zero_offset(i32 addrspace(1)* %out, <4 x i32> %in) {
+ %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
+ %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0)
+ %c = add i32 %a, %b
+ store i32 %c, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @cse_nonzero_offset
+; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
+; CHECK: add i32 [[CSE]], [[CSE]]
+define void @cse_nonzero_offset(i32 addrspace(1)* %out, <4 x i32> %in) {
+ %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
+ %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0)
+ %c = add i32 %a, %b
+ store i32 %c, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> nocapture, i32, i32)