[llvm] 791ec1c - [AMDGPU] Add intrinsics llvm.amdgcn.{raw|struct}.buffer.load.lds
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue May 17 10:32:23 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-05-17T10:32:13-07:00
New Revision: 791ec1c68e3bbf017ace434a162e61806fc03b47
URL: https://github.com/llvm/llvm-project/commit/791ec1c68e3bbf017ace434a162e61806fc03b47
DIFF: https://github.com/llvm/llvm-project/commit/791ec1c68e3bbf017ace434a162e61806fc03b47.diff
LOG: [AMDGPU] Add intrinsics llvm.amdgcn.{raw|struct}.buffer.load.lds
Differential Revision: https://reviews.llvm.org/D124884
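These intrinsics perform a buffer load whose result is written directly into LDS (the LDS base address is passed through M0) instead of being returned in VGPRs. A minimal IR usage sketch, mirroring the declarations used by the new tests below; the operand values are illustrative only:

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32>, i8 addrspace(3)* nocapture, i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32>, i8 addrspace(3)* nocapture, i32, i32, i32, i32, i32, i32)

define amdgpu_ps void @load_to_lds(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
  ; Raw form: rsrc, LDS base, data byte size (1/2/4), voffset, soffset, imm offset, aux (cachepolicy/swz).
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
  ; Struct form: same operands plus a vindex before voffset.
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
  ret void
}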
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index df552982a4ee9..dac82e076fe1e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1270,6 +1270,40 @@ class AMDGPUBufferAtomicFP : Intrinsic <
// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
+
+class AMDGPURawBufferLoadLDS : Intrinsic <
+ [],
+ [llvm_v4i32_ty, // rsrc(SGPR)
+ LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
+ llvm_i32_ty, // Data byte size: 1/2/4
+ llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
+ // bit 1 = slc,
+ // bit 2 = dlc on gfx10+))
+ // swizzled buffer (bit 3 = swz))
+ [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
+ ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
+
+class AMDGPUStructBufferLoadLDS : Intrinsic <
+ [],
+ [llvm_v4i32_ty, // rsrc(SGPR)
+ LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
+ llvm_i32_ty, // Data byte size: 1/2/4
+ llvm_i32_ty, // vindex(VGPR)
+ llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
+ // bit 1 = slc,
+ // bit 2 = dlc on gfx10+))
+ // swizzled buffer (bit 3 = swz))
+ [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
+ ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
+
} // defset AMDGPUBufferIntrinsics
// Uses that do not set the done bit should set IntrWriteMem on the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bdcc78bc7c97c..ba8a498622e74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1780,6 +1780,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ return selectBufferLoadLds(I);
default: {
return selectImpl(I, *CoverageInfo);
}
@@ -3054,6 +3057,98 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
+ unsigned Opc;
+ unsigned Size = MI.getOperand(3).getImm();
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == 9;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(4).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+ Optional<ValueAndVReg> MaybeVOffset =
+ getIConstantVRegValWithLookThrough(VOffset, *MRI);
+ const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
+
+ switch (Size) {
+ default:
+ return false;
+ case 1:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+ break;
+ case 2:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+ break;
+ case 4:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+ break;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .add(MI.getOperand(2));
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
+
+ if (HasVIndex && HasVOffset) {
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
+ .addReg(VIndex)
+ .addImm(AMDGPU::sub0)
+ .addReg(VOffset)
+ .addImm(AMDGPU::sub1);
+
+ MIB.addReg(IdxReg);
+ } else if (HasVIndex) {
+ MIB.addReg(VIndex);
+ } else if (HasVOffset) {
+ MIB.addReg(VOffset);
+ }
+
+ MIB.add(MI.getOperand(1)); // rsrc
+ MIB.add(MI.getOperand(5 + OpOffset)); // soffset
+ MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+ unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
+ MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
+ MIB.addImm((Aux >> 3) & 1); // swz
+
+ MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ StorePtrI.V = nullptr;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+
+ MachineMemOperand *StoreMMO =
+ MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), LoadMMO->getBaseAlign());
+
+ MIB.setMemRefs({LoadMMO, StoreMMO});
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
MI.setDesc(TII.get(MI.getOperand(1).getImm()));
MI.removeOperand(1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index e8e7f75831110..93fd6ef6641b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -143,6 +143,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
MachineOperand &DataOp) const;
+ bool selectBufferLoadLds(MachineInstr &MI) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2617558b25750..8e1d2c03d43e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3012,6 +3012,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 2);
return;
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
+ return;
+ }
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4436,6 +4450,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds: {
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -4454,6 +4475,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 429f03f27c1fe..cd13f7546f935 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1191,6 +1191,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
+
+ switch (IntrID) {
+ default:
+ break;
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ return true;
+ }
+ }
}
return true;
}
@@ -8228,6 +8239,85 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ unsigned Opc;
+ bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
+ unsigned OpOffset = HasVIndex ? 1 : 0;
+ SDValue VOffset = Op.getOperand(5 + OpOffset);
+ auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
+ bool HasVOffset = !CVOffset || !CVOffset->isZero();
+ unsigned Size = Op->getConstantOperandVal(4);
+
+ switch (Size) {
+ default:
+ return SDValue();
+ case 1:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+ break;
+ case 2:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+ break;
+ case 4:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+ break;
+ }
+
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+ SmallVector<SDValue, 8> Ops;
+
+ if (HasVIndex && HasVOffset)
+ Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
+ { Op.getOperand(5), // VIndex
+ VOffset }));
+ else if (HasVIndex)
+ Ops.push_back(Op.getOperand(5));
+ else if (HasVOffset)
+ Ops.push_back(VOffset);
+
+ Ops.push_back(Op.getOperand(2)); // rsrc
+ Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
+ Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
+ unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
+ Ops.push_back(
+ DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
+ Ops.push_back(
+ DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
+
+ auto *M = cast<MemSDNode>(Op);
+ MachineMemOperand *LoadMMO = M->getMemOperand();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ StorePtrI.V = nullptr;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), LoadMMO->getBaseAlign());
+
+ auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
+ DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+ return SDValue(Load, 0);
+ }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7b29210d429e..6c643ad1eb485 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1099,6 +1099,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
continue;
+ // No need to wait before load from VMEM to LDS.
+ if (mayWriteLDSThroughDMA(MI))
+ continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 44e8ccfd3ab26..7fcae17a0efa5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -385,6 +385,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ if (DataOpIdx == -1) // LDS DMA
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
new file mode 100644
index 0000000000000..6dfec8cb7fa33
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 lds
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:4 glc lds
+; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:8 slc lds
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
+ %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
+ %res = load float, float addrspace(3)* %ptr
+ ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_imm_voffset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v0, 0x800
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
+; GCN-LABEL: buffer_load_lds_dword_v_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_s_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword off, s[0:3], s5 lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_ushort:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v0, 0x800
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_ushort v0, s[0:3], 0 offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_ubyte:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_ubyte off, s[0:3], 0 offset:2048 lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
new file mode 100644
index 0000000000000..b94ba8334b34d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
+
+declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; SDAG-LABEL: buffer_load_lds_dword:
+; SDAG: ; %bb.0: ; %main_body
+; SDAG-NEXT: v_mov_b32_e32 v0, 8
+; SDAG-NEXT: s_mov_b32 m0, s4
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
+; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: ds_read_b32 v0, v0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: buffer_load_lds_dword:
+; GISEL: ; %bb.0: ; %main_body
+; GISEL-NEXT: s_mov_b32 m0, s4
+; GISEL-NEXT: v_mov_b32_e32 v0, 8
+; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
+; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: ds_read_b32 v0, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: ; return to shader part epilog
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
+ %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
+ %res = load float, float addrspace(3)* %ptr
+ ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_dword_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
+; GCN-LABEL: buffer_load_lds_dword_v_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_s_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 idxen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_ushort:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v1, 0x800
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_ubyte:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds
+; GCN-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+ ret void
+}