[llvm] dee3190 - [AMDGPU] Add llvm.amdgcn.global.load.lds intrinsic
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue May 17 12:35:42 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-05-17T12:35:27-07:00
New Revision: dee3190293fef40d08105936295a26f25d86755a
URL: https://github.com/llvm/llvm-project/commit/dee3190293fef40d08105936295a26f25d86755a
DIFF: https://github.com/llvm/llvm-project/commit/dee3190293fef40d08105936295a26f25d86755a.diff
LOG: [AMDGPU] Add llvm.amdgcn.global.load.lds intrinsic
Differential Revision: https://reviews.llvm.org/D125279
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index dac82e076fe1e..f5ccdeaefb36d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1811,6 +1811,25 @@ def int_amdgcn_perm :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+//===----------------------------------------------------------------------===//
+// GFX9 Intrinsics
+//===----------------------------------------------------------------------===//
+
+class AMDGPUGlobalLoadLDS : Intrinsic <
+ [],
+ [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from
+ LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to
+ llvm_i32_ty, // Data byte size: 1/2/4
+ llvm_i32_ty, // imm offset (applied to both global and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
+ // bit 1 = slc/sc1,
+                                      //                                   bit 2 = dlc on gfx10+,
+                                      //                                   bit 4 = scc/nt on gfx90a+))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
+ "", [SDNPMemOperand]>;
+def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
+
//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ba8a498622e74..ef1bbd4e9b4fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1783,6 +1783,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
return selectBufferLoadLds(I);
+ case Intrinsic::amdgcn_global_load_lds:
+ return selectGlobalLoadLds(I);
default: {
return selectImpl(I, *CoverageInfo);
}
@@ -3149,6 +3151,106 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+/// Match a zero extend from a 32-bit value to 64-bits.
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+ Register ZExtSrc;
+ if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+
+ // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ return Def->getOperand(1).getReg();
+ }
+
+ return Register();
+}
+
+bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
+ unsigned Opc;
+ unsigned Size = MI.getOperand(3).getImm();
+
+ switch (Size) {
+ default:
+ return false;
+ case 1:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+ break;
+ case 4:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+ break;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .add(MI.getOperand(2));
+
+ Register Addr = MI.getOperand(1).getReg();
+ Register VOffset;
+ // Try to split SAddr and VOffset. Global and LDS pointers share the same
+ // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+ if (!isSGPR(Addr)) {
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (isSGPR(AddrDef->Reg)) {
+ Addr = AddrDef->Reg;
+ } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+ if (SAddr && isSGPR(SAddr)) {
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+ if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ Addr = SAddr;
+ VOffset = Off;
+ }
+ }
+ }
+ }
+
+ if (isSGPR(Addr)) {
+ Opc = AMDGPU::getGlobalSaddrOp(Opc);
+ if (!VOffset) {
+ VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+ .addImm(0);
+ }
+ }
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
+ .addReg(Addr);
+
+ if (isSGPR(Addr))
+ MIB.addReg(VOffset);
+
+ MIB.add(MI.getOperand(4)) // offset
+ .add(MI.getOperand(5)); // cpol
+
+ MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = MI.getOperand(4).getImm();
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO =
+ MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), Align(4));
+
+ MIB.setMemRefs({LoadMMO, StoreMMO});
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
MI.setDesc(TII.get(MI.getOperand(1).getImm()));
MI.removeOperand(1);
@@ -3687,24 +3789,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
}};
}
-/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
- Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
-
- // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
- if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
- return false;
-
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
- return Def->getOperand(1).getReg();
- }
-
- return Register();
-}
-
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
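The address shape being split here corresponds to IR like the following (taken from the saddr-and-vaddr case in the new test): a uniform 64-bit base plus a zero-extended 32-bit VGPR offset, which lets selection use the _SADDR form with the 32-bit offset in the vaddr operand:

  %voffset.64 = zext i32 %voffset to i64
  %gep = getelementptr i8, i8 addrspace(1)* %gptr, i64 %voffset.64
  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gep, i8 addrspace(3)* %lptr, i32 4, i32 48, i32 16)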
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93fd6ef6641b9..04c769a7ce924 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -144,6 +144,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
MachineOperand &DataOp) const;
bool selectBufferLoadLds(MachineInstr &MI) const;
+ bool selectGlobalLoadLds(MachineInstr &MI) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8e1d2c03d43e7..4f44827e112e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3026,6 +3026,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
return;
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 2);
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4517,6 +4522,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
default:
return getInvalidInstructionMapping();
}
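Operand 2 here is the LDS base pointer, which is copied into M0 and therefore must be wavefront-uniform; constrainOpWithReadfirstlane inserts a V_READFIRSTLANE_B32 when it arrives in a VGPR. A minimal sketch of IR that triggers this (function name illustrative), with %lptr passed in a VGPR:

  define amdgpu_ps void @lds_ptr_in_vgpr(i8 addrspace(1)* inreg %gptr, i8 addrspace(3)* %lptr) {
    ; %lptr is a VGPR argument, hence treated as divergent, so selection emits
    ; v_readfirstlane_b32 before the copy to m0 (see the checks in the new test).
    call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 0, i32 0)
    ret void
  }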
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cd13f7546f935..8080e9275334d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1318,6 +1318,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
default:
return false;
}
@@ -8318,6 +8326,81 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(Load, 0);
}
+ case Intrinsic::amdgcn_global_load_lds: {
+ unsigned Opc;
+ unsigned Size = Op->getConstantOperandVal(4);
+ switch (Size) {
+ default:
+ return SDValue();
+ case 1:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+ break;
+ case 4:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+ break;
+ }
+
+ auto *M = cast<MemSDNode>(Op);
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+ SmallVector<SDValue, 6> Ops;
+
+ SDValue Addr = Op.getOperand(2); // Global ptr
+ SDValue VOffset;
+ // Try to split SAddr and VOffset. Global and LDS pointers share the same
+ // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+ if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
+ SDValue LHS = Addr.getOperand(0);
+ SDValue RHS = Addr.getOperand(1);
+
+ if (LHS->isDivergent())
+ std::swap(LHS, RHS);
+
+ if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOperand(0).getValueType() == MVT::i32) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ Addr = LHS;
+ VOffset = RHS.getOperand(0);
+ }
+ }
+
+ Ops.push_back(Addr);
+ if (!Addr->isDivergent()) {
+ Opc = AMDGPU::getGlobalSaddrOp(Opc);
+ if (!VOffset)
+ VOffset = SDValue(
+ DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+ DAG.getTargetConstant(0, DL, MVT::i32)), 0);
+ Ops.push_back(VOffset);
+ }
+
+ Ops.push_back(Op.getOperand(5)); // Offset
+ Ops.push_back(Op.getOperand(6)); // CPol
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
+
+ MachineMemOperand *LoadMMO = M->getMemOperand();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = Op->getConstantOperandVal(5);
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), Align(4));
+
+ auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+ return SDValue(Load, 0);
+ }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
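This is the SelectionDAG counterpart of the GlobalISel path above; the operand indices are shifted by one because operand 0 of INTRINSIC_VOID is the chain and operand 1 the intrinsic ID. When the global address is divergent and does not decompose into a uniform base plus a zero-extended 32-bit offset, the base form with a full 64-bit VGPR address is kept, as in the plain-vaddr case of the new test:

  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 16, i32 1)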
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7fcae17a0efa5..6d2ce9edb184a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -435,6 +435,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ if (DataOpIdx == -1) // LDS DMA
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
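Note on the SIInstrInfo change: the LDS DMA forms of these loads have neither a vdst nor a vdata operand, since the data goes straight to LDS rather than through a VGPR, so getMemOperandsWithOffsetWidth cannot report a width and now bails out instead of indexing with -1.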
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
new file mode 100644
index 0000000000000..c0bb6f64c9fc2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX90A
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX940
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900-GISEL
+
+declare void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+define amdgpu_ps void @global_load_lds_dword_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_dword_vaddr:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: v_readfirstlane_b32 s0, v2
+; GFX900-NEXT: s_mov_b32 m0, s0
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dword v[0:1], off offset:16 glc lds
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_dword_vaddr:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v2
+; GFX90A-NEXT: s_mov_b32 m0, s0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v[0:1], off offset:16 glc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX940-LABEL: global_load_lds_dword_vaddr:
+; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: v_readfirstlane_b32 s0, v2
+; GFX940-NEXT: s_mov_b32 m0, s0
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: global_load_lds_dword v[0:1], off offset:16 sc0
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_vaddr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_readfirstlane_b32 s0, v2
+; GFX10-NEXT: s_mov_b32 m0, s0
+; GFX10-NEXT: global_load_dword v[0:1], off offset:16 glc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_dword_vaddr:
+; GFX900-GISEL: ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v2
+; GFX900-GISEL-NEXT: s_nop 4
+; GFX900-GISEL-NEXT: global_load_dword v[0:1], off offset:16 glc lds
+; GFX900-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_saddr(i8 addrspace(1)* nocapture inreg %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_dword_saddr:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: v_readfirstlane_b32 s2, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-NEXT: s_mov_b32 m0, s2
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dword v1, s[0:1] offset:32 slc lds
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_dword_saddr:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_readfirstlane_b32 s2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v1, s[0:1] offset:32 slc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX940-LABEL: global_load_lds_dword_saddr:
+; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: v_readfirstlane_b32 s2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NEXT: s_mov_b32 m0, s2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_saddr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v0, s[0:1] offset:32 slc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_dword_saddr:
+; GFX900-GISEL: ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-GISEL-NEXT: s_nop 3
+; GFX900-GISEL-NEXT: global_load_dword v0, s[0:1] offset:32 slc lds
+; GFX900-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 32, i32 2)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_saddr_and_vaddr(i8 addrspace(1)* nocapture inreg %gptr, i8 addrspace(3)* nocapture %lptr, i32 %voffset) {
+; GFX900-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: v_readfirstlane_b32 s2, v0
+; GFX900-NEXT: s_mov_b32 m0, s2
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dword v1, s[0:1] offset:48 lds
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_readfirstlane_b32 s2, v0
+; GFX90A-NEXT: s_mov_b32 m0, s2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dword v1, s[0:1] offset:48 scc lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX940-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: v_readfirstlane_b32 s2, v0
+; GFX940-NEXT: s_mov_b32 m0, s2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-NEXT: s_mov_b32 m0, s2
+; GFX10-NEXT: global_load_dword v1, s[0:1] offset:48 lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX900-GISEL: ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GFX900-GISEL-NEXT: s_nop 4
+; GFX900-GISEL-NEXT: global_load_dword v1, s[0:1] offset:48 lds
+; GFX900-GISEL-NEXT: s_endpgm
+main_body:
+ %voffset.64 = zext i32 %voffset to i64
+ %gep = getelementptr i8, i8 addrspace(1)* %gptr, i64 %voffset.64
+ call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gep, i8 addrspace(3)* %lptr, i32 4, i32 48, i32 16)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_ushort_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_ushort_vaddr:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: v_readfirstlane_b32 s0, v2
+; GFX900-NEXT: s_mov_b32 m0, s0
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_ushort v[0:1], off lds
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_ushort_vaddr:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v2
+; GFX90A-NEXT: s_mov_b32 m0, s0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_ushort v[0:1], off lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX940-LABEL: global_load_lds_ushort_vaddr:
+; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: v_readfirstlane_b32 s0, v2
+; GFX940-NEXT: s_mov_b32 m0, s0
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: global_load_lds_ushort v[0:1], off
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_ushort_vaddr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_readfirstlane_b32 s0, v2
+; GFX10-NEXT: s_mov_b32 m0, s0
+; GFX10-NEXT: global_load_ushort v[0:1], off dlc lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_ushort_vaddr:
+; GFX900-GISEL: ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v2
+; GFX900-GISEL-NEXT: s_nop 4
+; GFX900-GISEL-NEXT: global_load_ushort v[0:1], off lds
+; GFX900-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 2, i32 0, i32 4)
+ ret void
+}
+
+define amdgpu_ps void @global_load_lds_ubyte_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_ubyte_vaddr:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: v_readfirstlane_b32 s0, v2
+; GFX900-NEXT: s_mov_b32 m0, s0
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_ubyte v[0:1], off lds
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_ubyte_vaddr:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v2
+; GFX90A-NEXT: s_mov_b32 m0, s0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_ubyte v[0:1], off lds
+; GFX90A-NEXT: s_endpgm
+;
+; GFX940-LABEL: global_load_lds_ubyte_vaddr:
+; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: v_readfirstlane_b32 s0, v2
+; GFX940-NEXT: s_mov_b32 m0, s0
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: global_load_lds_ubyte v[0:1], off
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_load_lds_ubyte_vaddr:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_readfirstlane_b32 s0, v2
+; GFX10-NEXT: s_mov_b32 m0, s0
+; GFX10-NEXT: global_load_ubyte v[0:1], off lds
+; GFX10-NEXT: s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_ubyte_vaddr:
+; GFX900-GISEL: ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v2
+; GFX900-GISEL-NEXT: s_nop 4
+; GFX900-GISEL-NEXT: global_load_ubyte v[0:1], off lds
+; GFX900-GISEL-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 1, i32 0, i32 0)
+ ret void
+}