[llvm] r309998 - AMDGPU: Pass special input registers to functions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 3 16:00:30 PDT 2017
Author: arsenm
Date: Thu Aug 3 16:00:29 2017
New Revision: 309998
URL: http://llvm.org/viewvc/llvm-project?rev=309998&view=rev
Log:
AMDGPU: Pass special input registers to functions
Added:
llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp Thu Aug 3 16:00:29 2017
@@ -41,7 +41,7 @@ unsigned AMDGPUCallLowering::lowerParame
unsigned Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
- const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = *MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -49,7 +49,7 @@ unsigned AMDGPUCallLowering::lowerParame
LLT PtrType = getLLTForType(*PtrTy, DL);
unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
unsigned KernArgSegmentPtr =
- TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Thu Aug 3 16:00:29 2017
@@ -3582,6 +3582,49 @@ SDValue AMDGPUTargetLowering::CreateLive
return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
+/// Load an input value of type \p VT that was passed on the stack.
+///
+/// A fixed frame object of VT's store size is created at \p Offset and the
+/// value is loaded from it. The load is chained to the DAG entry node and is
+/// flagged dereferenceable and invariant.
+SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
+                                                  EVT VT,
+                                                  const SDLoc &SL,
+                                                  int64_t Offset) const {
+  MachineFunction &Func = DAG.getMachineFunction();
+  MachineFrameInfo &FrameInfo = Func.getFrameInfo();
+
+  // Frame object describing the incoming stack slot.
+  int SlotIdx = FrameInfo.CreateFixedObject(VT.getStoreSize(), Offset, true);
+  MachinePointerInfo SlotInfo = MachinePointerInfo::getStack(Func, Offset);
+  SDValue SlotAddr = DAG.getFrameIndex(SlotIdx, MVT::i32);
+
+  return DAG.getLoad(VT, SL, DAG.getEntryNode(), SlotAddr, SlotInfo, 4,
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
+
+/// Store the outgoing value \p ArgVal to the stack.
+///
+/// The destination address is \p StackPtr plus \p Offset, computed as a 32-bit
+/// add. The store is chained on \p Chain; the resulting store node is returned
+/// so the caller can merge it into its memory-operation chains.
+SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
+                                                   const SDLoc &SL,
+                                                   SDValue Chain,
+                                                   SDValue StackPtr,
+                                                   SDValue ArgVal,
+                                                   int64_t Offset) const {
+  MachinePointerInfo SlotInfo =
+      MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
+
+  // Address of the outgoing slot relative to the stack pointer.
+  SDValue SlotAddr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr,
+                                 DAG.getConstant(Offset, SL, MVT::i32));
+
+  return DAG.getStore(Chain, SL, ArgVal, SlotAddr, SlotInfo, 4,
+                      MachineMemOperand::MODereferenceable);
+}
+
+
+/// Materialize the incoming value described by \p Arg.
+///
+/// A register argument becomes a live-in register copy of class \p RC; any
+/// other argument is loaded from its stack offset. \p Arg must be set.
+SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
+                                             const TargetRegisterClass *RC,
+                                             EVT VT, const SDLoc &SL,
+                                             const ArgDescriptor &Arg) const {
+  assert(Arg && "Attempting to load missing argument");
+
+  // Handle the stack case first; the register case is the fall-through.
+  if (!Arg.isRegister())
+    return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+
+  return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
+}
+
+
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Thu Aug 3 16:00:29 2017
@@ -24,7 +24,7 @@ namespace llvm {
class AMDGPUMachineFunction;
class AMDGPUSubtarget;
-class MachineRegisterInfo;
+struct ArgDescriptor;
class AMDGPUTargetLowering : public TargetLowering {
private:
@@ -237,6 +237,25 @@ public:
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
}
+ /// Similar to CreateLiveInRegister, except the value may be loaded from a
+ /// stack slot rather than passed in a register.
+ SDValue loadStackInputValue(SelectionDAG &DAG,
+ EVT VT,
+ const SDLoc &SL,
+ int64_t Offset) const;
+
+ SDValue storeStackInputValue(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Chain,
+ SDValue StackPtr,
+ SDValue ArgVal,
+ int64_t Offset) const;
+
+ SDValue loadInputValue(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ EVT VT, const SDLoc &SL,
+ const ArgDescriptor &Arg) const;
+
enum ImplicitParameter {
FIRST_IMPLICIT,
GRID_DIM = FIRST_IMPLICIT,
Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Thu Aug 3 16:00:29 2017
@@ -38,6 +38,7 @@ void SIFrameLowering::emitFlatScratchIni
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// We don't need this if we only have spills since there is no user facing
// scratch.
@@ -55,7 +56,7 @@ void SIFrameLowering::emitFlatScratchIni
MachineBasicBlock::iterator I = MBB.begin();
unsigned FlatScratchInitReg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
@@ -64,7 +65,6 @@ void SIFrameLowering::emitFlatScratchIni
unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
// Do a 64-bit pointer add.
@@ -283,13 +283,13 @@ void SIFrameLowering::emitEntryFunctionP
}
// We need to insert initialization of the scratch resource descriptor.
- unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2(MF)) {
- PreloadedPrivateBufferReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ PreloadedPrivateBufferReg = MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Thu Aug 3 16:00:29 2017
@@ -45,6 +45,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
@@ -895,14 +896,19 @@ SDValue SITargetLowering::lowerKernArgPa
uint64_t Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- unsigned InputPtrReg = TRI->getPreloadedValue(MF,
- SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ const ArgDescriptor *InputPtrReg;
+ const TargetRegisterClass *RC;
+
+ std::tie(InputPtrReg, RC)
+ = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
- MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+ MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
+
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
@@ -1005,6 +1011,17 @@ SDValue SITargetLowering::lowerStackPara
return ArgValue;
}
+/// Return the preloaded value \p PVID of the current function as a live-in
+/// register copy of type \p VT.
+///
+/// The argument descriptor and register class are looked up in \p MFI. The
+/// value must be present and, since getRegister() is used, passed in a
+/// register.
+SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
+                                            const SIMachineFunctionInfo &MFI,
+                                            EVT VT,
+                                            AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
+  const ArgDescriptor *Reg = nullptr;
+  const TargetRegisterClass *RC = nullptr;
+
+  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+
+  // The lookup returns a pointer, and passSpecialInputs null-checks the same
+  // kind of lookup. Fail loudly here rather than dereference a null
+  // descriptor when the function does not have this input.
+  assert(Reg && "Requested preloaded value is not available");
+  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+}
+
+
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@@ -1055,27 +1072,129 @@ static void processShaderInputArgs(Small
}
// Allocate special inputs passed in VGPRs.
-static void allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
if (Info.hasWorkItemIDX()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ unsigned Reg = AMDGPU::VGPR0;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ assert(Reg == AMDGPU::VGPR0);
+
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDY()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ unsigned Reg = AMDGPU::VGPR1;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
+ assert(Reg == AMDGPU::VGPR1);
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDZ()) {
- unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ unsigned Reg = AMDGPU::VGPR2;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
+ assert(Reg == AMDGPU::VGPR2);
CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ }
+}
+
+// Allocate a location for a 32-bit special input. The first of the leading 32
+// VGPR_32 registers that is still unallocated is used and marked live-in; if
+// none of them is free, a 4-byte, 4-byte-aligned stack slot is allocated
+// instead.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+  ArrayRef<MCPhysReg> Candidates
+    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
+  const unsigned FirstFree = CCInfo.getFirstUnallocated(Candidates);
+
+  if (FirstFree == Candidates.size()) {
+    // Every candidate register is taken: fall back to the stack.
+    int64_t SlotOffset = CCInfo.AllocateStack(4, 4);
+    return ArgDescriptor::createStack(SlotOffset);
  }
+
+  unsigned Reg = CCInfo.AllocateReg(Candidates[FirstFree]);
+  assert(Reg != AMDGPU::NoRegister);
+
+  CCInfo.getMachineFunction().addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+  return ArgDescriptor::createRegister(Reg);
+}
+
+
+// Allocate a register of class \p RC for a special SGPR input. Only the first
+// \p NumArgRegs registers of the class are candidates; the first one not yet
+// allocated is claimed, added as a live-in and returned as a register
+// descriptor. SGPR inputs have no stack fallback: running out of candidates is
+// a fatal error.
+static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
+                                             const TargetRegisterClass *RC,
+                                             unsigned NumArgRegs) {
+  // Honor the caller's limit. The count used to be hard-coded to 32, which
+  // silently ignored NumArgRegs and let the 64-bit caller (which passes 16)
+  // search 32 register pairs.
+  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), NumArgRegs);
+  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
+  if (RegIdx == ArgSGPRs.size())
+    report_fatal_error("ran out of SGPRs for arguments");
+
+  unsigned Reg = ArgSGPRs[RegIdx];
+  Reg = CCInfo.AllocateReg(Reg);
+  assert(Reg != AMDGPU::NoRegister);
+
+  MachineFunction &MF = CCInfo.getMachineFunction();
+  MF.addLiveIn(Reg, RC);
+  return ArgDescriptor::createRegister(Reg);
+}
+
+
+// Allocate a 32-bit special SGPR input: forwards to allocateSGPR32InputImpl
+// with the SGPR_32 class and a register count of 32.
+static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
+  const TargetRegisterClass *RC = &AMDGPU::SGPR_32RegClass;
+  const unsigned NumCandidates = 32;
+  return allocateSGPR32InputImpl(CCInfo, RC, NumCandidates);
+}
+
+
+// Allocate a 64-bit special SGPR input: forwards to allocateSGPR32InputImpl
+// with the SGPR_64 class and a register count of 16.
+static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
+  const TargetRegisterClass *RC = &AMDGPU::SGPR_64RegClass;
+  const unsigned NumCandidates = 16;
+  return allocateSGPR32InputImpl(CCInfo, RC, NumCandidates);
+}
+
+
+// Assign a location to each work-item ID this function uses, in X, Y, Z
+// order, and record it in \p Info. Each location comes from
+// allocateVGPR32Input. \p MF and \p TRI are not used.
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) {
+  if (Info.hasWorkItemIDX()) {
+    ArgDescriptor IDX = allocateVGPR32Input(CCInfo);
+    Info.setWorkItemIDX(IDX);
+  }
+
+  if (Info.hasWorkItemIDY()) {
+    ArgDescriptor IDY = allocateVGPR32Input(CCInfo);
+    Info.setWorkItemIDY(IDY);
+  }
+
+  if (Info.hasWorkItemIDZ()) {
+    ArgDescriptor IDZ = allocateVGPR32Input(CCInfo);
+    Info.setWorkItemIDZ(IDZ);
+  }
+}
+
+
+// Assign an SGPR location to each special scalar input this function uses and
+// record it in the function's argument info. The 64-bit inputs (dispatch
+// pointer, queue pointer, kernarg segment pointer, dispatch ID) are allocated
+// first, followed by the 32-bit work-group IDs in X, Y, Z order. \p MF and
+// \p TRI are not used.
+static void allocateSpecialInputSGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) {
+  auto &Args = Info.getArgInfo();
+
+  // TODO: Unify handling with private memory pointers.
+
+  // 64-bit inputs.
+  if (Info.hasDispatchPtr()) {
+    Args.DispatchPtr = allocateSGPR64Input(CCInfo);
+  }
+
+  if (Info.hasQueuePtr()) {
+    Args.QueuePtr = allocateSGPR64Input(CCInfo);
+  }
+
+  if (Info.hasKernargSegmentPtr()) {
+    Args.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+  }
+
+  if (Info.hasDispatchID()) {
+    Args.DispatchID = allocateSGPR64Input(CCInfo);
+  }
+
+  // flat_scratch_init is not applicable for non-kernel functions.
+
+  // 32-bit inputs.
+  if (Info.hasWorkGroupIDX()) {
+    Args.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+  }
+
+  if (Info.hasWorkGroupIDY()) {
+    Args.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+  }
+
+  if (Info.hasWorkGroupIDZ()) {
+    Args.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+  }
}
// Allocate special inputs passed in user SGPRs.
@@ -1212,8 +1331,8 @@ static void reservePrivateMemoryRegs(con
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
if (MFI.hasCalls()) {
@@ -1229,8 +1348,8 @@ static void reservePrivateMemoryRegs(con
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
} else {
- unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
}
} else {
@@ -1256,8 +1375,8 @@ static void reservePrivateMemoryRegs(con
Info.setScratchRSrcReg(ReservedBufferReg);
if (HasStackObjects && !MFI.hasCalls()) {
- unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
@@ -1390,7 +1509,7 @@ SDValue SITargetLowering::LowerFormalArg
}
if (IsEntryFunc) {
- allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1509,6 +1628,11 @@ SDValue SITargetLowering::LowerFormalArg
InVals.push_back(Val);
}
+ if (!IsEntryFunc) {
+ // Special inputs come after user arguments.
+ allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ }
+
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
@@ -1516,8 +1640,13 @@ SDValue SITargetLowering::LowerFormalArg
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
+
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
@@ -1741,6 +1870,81 @@ SDValue SITargetLowering::LowerCallResul
return Chain;
}
+// Add code to pass special inputs required depending on used features separate
+// from the explicit user arguments present in the IR.
+//
+// For every special input the callee's recorded argument info asks for, the
+// value is read from the caller's own incoming location and either queued in
+// \p RegsToPass (callee expects a register) or stored relative to \p StackPtr,
+// with the store appended to \p MemOpChains (callee expects a stack slot).
+// Calls without a call site are left untouched; indirect calls are a fatal
+// error.
+void SITargetLowering::passSpecialInputs(
+    CallLoweringInfo &CLI,
+    const SIMachineFunctionInfo &Info,
+    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains,
+    SDValue Chain,
+    SDValue StackPtr) const {
+  // If we don't have a call site, this was a call inserted by
+  // legalization. These can never use special inputs.
+  if (!CLI.CS)
+    return;
+
+  const Function *CalleeFunc = CLI.CS.getCalledFunction();
+  if (!CalleeFunc)
+    report_fatal_error("indirect calls not handled");
+
+  SelectionDAG &DAG = CLI.DAG;
+  const SDLoc &DL = CLI.DL;
+
+  const SISubtarget *ST = getSubtarget();
+  const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+  auto &ArgUsageInfo =
+    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+  const AMDGPUFunctionArgInfo &CalleeArgInfo
+    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+
+  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
+
+  // TODO: Unify with private memory register handling. This is complicated by
+  // the fact that, at least in kernels, a special input is not necessarily
+  // passed on to the callee in the same location it arrived in.
+  const AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
+    AMDGPUFunctionArgInfo::DISPATCH_PTR,
+    AMDGPUFunctionArgInfo::QUEUE_PTR,
+    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+    AMDGPUFunctionArgInfo::DISPATCH_ID,
+    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
+    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
+    AMDGPUFunctionArgInfo::WORKITEM_ID_Z
+  };
+
+  for (auto InputID : InputRegs) {
+    const ArgDescriptor *OutgoingArg;
+    const TargetRegisterClass *ArgRC;
+
+    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
+    // The callee does not take this input; nothing to pass.
+    if (!OutgoingArg)
+      continue;
+
+    const ArgDescriptor *IncomingArg;
+    const TargetRegisterClass *IncomingArgRC;
+    std::tie(IncomingArg, IncomingArgRC)
+      = CallerArgInfo.getPreloadedValue(InputID);
+
+    // The caller must itself have received every input it forwards. Check
+    // the pointer before it is dereferenced below.
+    assert(IncomingArg && "Callee requires a special input the caller lacks");
+    assert(IncomingArgRC == ArgRC);
+    (void)IncomingArgRC; // Only used by the assert above.
+
+    // All special arguments are ints for now.
+    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
+    SDValue InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
+    if (OutgoingArg->isRegister()) {
+      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+    } else {
+      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
+                                              InputReg,
+                                              OutgoingArg->getStackOffset());
+      MemOpChains.push_back(ArgStore);
+    }
+  }
+}
+
+
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1897,6 +2101,9 @@ SDValue SITargetLowering::LowerCall(Call
}
}
+ // Copy special input registers after user input arguments.
+ passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -3424,7 +3631,6 @@ SDValue SITargetLowering::LowerINTRINSIC
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -3436,10 +3642,8 @@ SDValue SITargetLowering::LowerINTRINSIC
case Intrinsic::amdgcn_implicit_buffer_ptr: {
if (getSubtarget()->isAmdCodeObjectV2(MF))
return emitNonHSAIntrinsicError(DAG, DL, VT);
-
- unsigned Reg = TRI->getPreloadedValue(MF,
- SIRegisterInfo::IMPLICIT_BUFFER_PTR);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
@@ -3451,10 +3655,9 @@ SDValue SITargetLowering::LowerINTRINSIC
return DAG.getUNDEF(VT);
}
- auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
- SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
- TRI->getPreloadedValue(MF, Reg), VT);
+ auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+ AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
+ return getPreloadedValue(DAG, *MFI, VT, RegID);
}
case Intrinsic::amdgcn_implicitarg_ptr: {
if (MFI->isEntryFunction())
@@ -3462,13 +3665,11 @@ SDValue SITargetLowering::LowerINTRINSIC
report_fatal_error("amdgcn.implicitarg.ptr not implemented for functions");
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
- unsigned Reg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
case Intrinsic::amdgcn_dispatch_id: {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
}
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
@@ -3553,28 +3754,32 @@ SDValue SITargetLowering::LowerINTRINSIC
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
- case Intrinsic::amdgcn_workitem_id_x:
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_workitem_id_x: {
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDX);
+ }
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+ return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()),
+ MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Thu Aug 3 16:00:29 2017
@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
#include "AMDGPUISelLowering.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "SIInstrInfo.h"
namespace llvm {
@@ -32,6 +33,10 @@ class SITargetLowering final : public AM
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
const SDLoc &SL, SDValue Chain,
const ISD::InputArg &Arg) const;
+ SDValue getPreloadedValue(SelectionDAG &DAG,
+ const SIMachineFunctionInfo &MFI,
+ EVT VT,
+ AMDGPUFunctionArgInfo::PreloadedValue) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
@@ -205,6 +210,14 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ void passSpecialInputs(
+ CallLoweringInfo &CLI,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ SDValue Chain,
+ SDValue StackPtr) const;
+
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Thu Aug 3 16:00:29 2017
@@ -916,7 +916,6 @@ unsigned SIInstrInfo::calculateLDSSpillA
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -936,13 +935,13 @@ unsigned SIInstrInfo::calculateLDSSpillA
WorkGroupSize > WavefrontSize) {
unsigned TIDIGXReg
- = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
unsigned TIDIGYReg
- = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
unsigned TIDIGZReg
- = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
+ = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
unsigned InputPtrReg =
- TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
Entry.addLiveIn(Reg);
Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp Thu Aug 3 16:00:29 2017
@@ -27,24 +27,7 @@ SIMachineFunctionInfo::SIMachineFunction
ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
FrameOffsetReg(AMDGPU::FP_REG),
StackPtrOffsetReg(AMDGPU::SP_REG),
- PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
- DispatchPtrUserSGPR(AMDGPU::NoRegister),
- QueuePtrUserSGPR(AMDGPU::NoRegister),
- KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
- DispatchIDUserSGPR(AMDGPU::NoRegister),
- FlatScratchInitUserSGPR(AMDGPU::NoRegister),
- PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
- GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
- GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
- GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
- WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
- WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
- WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
- WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
- PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
- WorkItemIDXVGPR(AMDGPU::NoRegister),
- WorkItemIDYVGPR(AMDGPU::NoRegister),
- WorkItemIDZVGPR(AMDGPU::NoRegister),
+ ArgInfo(),
PSInputAddr(0),
PSInputEnable(0),
ReturnsVoid(true),
@@ -91,8 +74,11 @@ SIMachineFunctionInfo::SIMachineFunction
FrameOffsetReg = AMDGPU::SGPR5;
StackPtrOffsetReg = AMDGPU::SGPR32;
- // FIXME: Not really a system SGPR.
- PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(ScratchRSrcReg);
+ ArgInfo.PrivateSegmentWaveByteOffset =
+ ArgDescriptor::createRegister(ScratchWaveOffsetReg);
+
if (F->hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
@@ -151,10 +137,11 @@ SIMachineFunctionInfo::SIMachineFunction
if (HasStackObjects || MaySpill) {
PrivateSegmentWaveByteOffset = true;
- // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
- (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ ArgInfo.PrivateSegmentWaveByteOffset
+ = ArgDescriptor::createRegister(AMDGPU::SGPR5);
}
}
@@ -189,52 +176,54 @@ SIMachineFunctionInfo::SIMachineFunction
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
- PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
NumUserSGPRs += 4;
- return PrivateSegmentBufferUserSGPR;
+ return ArgInfo.PrivateSegmentBuffer.getRegister();
}
unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
- DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return DispatchPtrUserSGPR;
+ return ArgInfo.DispatchPtr.getRegister();
}
unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
- QueuePtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return QueuePtrUserSGPR;
+ return ArgInfo.QueuePtr.getRegister();
}
unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
- KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.KernargSegmentPtr
+ = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return KernargSegmentPtrUserSGPR;
+ return ArgInfo.KernargSegmentPtr.getRegister();
}
unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
- DispatchIDUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return DispatchIDUserSGPR;
+ return ArgInfo.DispatchID.getRegister();
}
unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
- FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return FlatScratchInitUserSGPR;
+ return ArgInfo.FlatScratchInit.getRegister();
}
unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
- ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
- return ImplicitBufferPtrUserSGPR;
+ return ArgInfo.ImplicitBufferPtr.getRegister();
}
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h Thu Aug 3 16:00:29 2017
@@ -16,6 +16,7 @@
#include "AMDGPUMachineFunction.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "AMDGPUArgumentUsageInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -96,33 +97,7 @@ class SIMachineFunctionInfo final : publ
// Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
unsigned StackPtrOffsetReg;
- // Input registers for non-HSA ABI
- unsigned ImplicitBufferPtrUserSGPR;
-
- // Input registers setup for the HSA ABI.
- // User SGPRs in allocation order.
- unsigned PrivateSegmentBufferUserSGPR;
- unsigned DispatchPtrUserSGPR;
- unsigned QueuePtrUserSGPR;
- unsigned KernargSegmentPtrUserSGPR;
- unsigned DispatchIDUserSGPR;
- unsigned FlatScratchInitUserSGPR;
- unsigned PrivateSegmentSizeUserSGPR;
- unsigned GridWorkGroupCountXUserSGPR;
- unsigned GridWorkGroupCountYUserSGPR;
- unsigned GridWorkGroupCountZUserSGPR;
-
- // System SGPRs in allocation order.
- unsigned WorkGroupIDXSystemSGPR;
- unsigned WorkGroupIDYSystemSGPR;
- unsigned WorkGroupIDZSystemSGPR;
- unsigned WorkGroupInfoSystemSGPR;
- unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
-
- // VGPR inputs. These are always v0, v1 and v2 for entry functions.
- unsigned WorkItemIDXVGPR;
- unsigned WorkItemIDYVGPR;
- unsigned WorkItemIDZVGPR;
+ AMDGPUFunctionArgInfo ArgInfo;
// Graphics info.
unsigned PSInputAddr;
@@ -235,7 +210,6 @@ private:
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
public:
-
SIMachineFunctionInfo(const MachineFunction &MF);
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
@@ -266,37 +240,52 @@ public:
// Add system SGPRs.
unsigned addWorkGroupIDX() {
- WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupIDXSystemSGPR;
+ return ArgInfo.WorkGroupIDX.getRegister();
}
unsigned addWorkGroupIDY() {
- WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupIDYSystemSGPR;
+ return ArgInfo.WorkGroupIDY.getRegister();
}
unsigned addWorkGroupIDZ() {
- WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupIDZSystemSGPR;
+ return ArgInfo.WorkGroupIDZ.getRegister();
}
unsigned addWorkGroupInfo() {
- WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+ ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return WorkGroupInfoSystemSGPR;
+ return ArgInfo.WorkGroupInfo.getRegister();
}
+ // Add special VGPR inputs
+ void setWorkItemIDX(ArgDescriptor Arg) {
+ ArgInfo.WorkItemIDX = Arg;
+ }
+
+ void setWorkItemIDY(ArgDescriptor Arg) {
+ ArgInfo.WorkItemIDY = Arg;
+ }
+
+ void setWorkItemIDZ(ArgDescriptor Arg) {
+ ArgInfo.WorkItemIDZ = Arg;
+ }
+
+
unsigned addPrivateSegmentWaveByteOffset() {
- PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+ ArgInfo.PrivateSegmentWaveByteOffset
+ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
- return PrivateSegmentWaveByteOffsetSystemSGPR;
+ return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
void setPrivateSegmentWaveByteOffset(unsigned Reg) {
- PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+ ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
}
bool hasPrivateSegmentBuffer() const {
@@ -375,6 +364,23 @@ public:
return ImplicitBufferPtr;
}
+ AMDGPUFunctionArgInfo &getArgInfo() {
+ return ArgInfo;
+ }
+
+ const AMDGPUFunctionArgInfo &getArgInfo() const {
+ return ArgInfo;
+ }
+
+ std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ return ArgInfo.getPreloadedValue(Value);
+ }
+
+ unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ return ArgInfo.getPreloadedValue(Value).first->getRegister();
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -384,7 +390,7 @@ public:
}
unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
- return PrivateSegmentWaveByteOffsetSystemSGPR;
+ return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
/// \brief Returns the physical register reserved for use as the resource
@@ -426,11 +432,11 @@ public:
}
unsigned getQueuePtrUserSGPR() const {
- return QueuePtrUserSGPR;
+ return ArgInfo.QueuePtr.getRegister();
}
unsigned getImplicitBufferPtrUserSGPR() const {
- return ImplicitBufferPtrUserSGPR;
+ return ArgInfo.ImplicitBufferPtr.getRegister();
}
bool hasSpilledSGPRs() const {
@@ -562,13 +568,13 @@ public:
switch (Dim) {
case 0:
assert(hasWorkGroupIDX());
- return WorkGroupIDXSystemSGPR;
+ return ArgInfo.WorkGroupIDX.getRegister();
case 1:
assert(hasWorkGroupIDY());
- return WorkGroupIDYSystemSGPR;
+ return ArgInfo.WorkGroupIDY.getRegister();
case 2:
assert(hasWorkGroupIDZ());
- return WorkGroupIDZSystemSGPR;
+ return ArgInfo.WorkGroupIDZ.getRegister();
}
llvm_unreachable("unexpected dimension");
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Thu Aug 3 16:00:29 2017
@@ -1338,61 +1338,6 @@ bool SIRegisterInfo::shouldRewriteCopySr
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-// FIXME: Most of these are flexible with HSA and we don't need to reserve them
-// as input registers if unused. Whether the dispatch ptr is necessary should be
-// easy to detect from used intrinsics. Scratch setup is harder to know.
-unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
- enum PreloadedValue Value) const {
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- (void)ST;
- switch (Value) {
- case SIRegisterInfo::WORKGROUP_ID_X:
- assert(MFI->hasWorkGroupIDX());
- return MFI->WorkGroupIDXSystemSGPR;
- case SIRegisterInfo::WORKGROUP_ID_Y:
- assert(MFI->hasWorkGroupIDY());
- return MFI->WorkGroupIDYSystemSGPR;
- case SIRegisterInfo::WORKGROUP_ID_Z:
- assert(MFI->hasWorkGroupIDZ());
- return MFI->WorkGroupIDZSystemSGPR;
- case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
- return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
- case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
- case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
- assert(MFI->hasImplicitBufferPtr());
- return MFI->ImplicitBufferPtrUserSGPR;
- case SIRegisterInfo::KERNARG_SEGMENT_PTR:
- assert(MFI->hasKernargSegmentPtr());
- return MFI->KernargSegmentPtrUserSGPR;
- case SIRegisterInfo::DISPATCH_ID:
- assert(MFI->hasDispatchID());
- return MFI->DispatchIDUserSGPR;
- case SIRegisterInfo::FLAT_SCRATCH_INIT:
- assert(MFI->hasFlatScratchInit());
- return MFI->FlatScratchInitUserSGPR;
- case SIRegisterInfo::DISPATCH_PTR:
- assert(MFI->hasDispatchPtr());
- return MFI->DispatchPtrUserSGPR;
- case SIRegisterInfo::QUEUE_PTR:
- assert(MFI->hasQueuePtr());
- return MFI->QueuePtrUserSGPR;
- case SIRegisterInfo::WORKITEM_ID_X:
- assert(MFI->hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case SIRegisterInfo::WORKITEM_ID_Y:
- assert(MFI->hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case SIRegisterInfo::WORKITEM_ID_Z:
- assert(MFI->hasWorkItemIDZ());
- return AMDGPU::VGPR2;
- }
- llvm_unreachable("unexpected preloaded value type");
-}
-
/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h?rev=309998&r1=309997&r2=309998&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h Thu Aug 3 16:00:29 2017
@@ -186,31 +186,6 @@ public:
OpType <= AMDGPU::OPERAND_SRC_LAST;
}
- enum PreloadedValue {
- // SGPRS:
- PRIVATE_SEGMENT_BUFFER = 0,
- DISPATCH_PTR = 1,
- QUEUE_PTR = 2,
- KERNARG_SEGMENT_PTR = 3,
- DISPATCH_ID = 4,
- FLAT_SCRATCH_INIT = 5,
- WORKGROUP_ID_X = 10,
- WORKGROUP_ID_Y = 11,
- WORKGROUP_ID_Z = 12,
- PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
- IMPLICIT_BUFFER_PTR = 15,
-
- // VGPRS:
- FIRST_VGPR_VALUE = 16,
- WORKITEM_ID_X = FIRST_VGPR_VALUE,
- WORKITEM_ID_Y = 17,
- WORKITEM_ID_Z = 18
- };
-
- /// \brief Returns the physical register that \p Value is stored in.
- unsigned getPreloadedValue(const MachineFunction &MF,
- enum PreloadedValue Value) const;
-
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
const MachineFunction &MF) const;
Added: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll?rev=309998&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll Thu Aug 3 16:00:29 2017
@@ -0,0 +1,612 @@
+; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}use_dispatch_ptr:
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+define void @use_dispatch_ptr() #1 {
+ %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+ %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
+ %value = load volatile i32, i32 addrspace(2)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr:
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_mov_b64 s[6:7], s[4:5]
+define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
+ call void @use_dispatch_ptr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_queue_ptr:
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+define void @use_queue_ptr() #1 {
+ %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+ %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+ %value = load volatile i32, i32 addrspace(2)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
+ call void @use_queue_ptr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
+; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
+; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
+
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
+define void @use_queue_ptr_addrspacecast() #1 {
+ %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %asc
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
+; CIVI: enable_sgpr_queue_ptr = 1
+
+; CIVI: s_mov_b64 s[6:7], s[4:5]
+; GFX9-NOT: s_mov_b64
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
+ call void @use_queue_ptr_addrspacecast()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+define void @use_kernarg_segment_ptr() #1 {
+ %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+ %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
+ %value = load volatile i32, i32 addrspace(2)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
+; GCN: enable_sgpr_kernarg_segment_ptr = 1
+; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
+ call void @use_kernarg_segment_ptr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_dispatch_id:
+; GCN: ; use s[6:7]
+define void @use_dispatch_id() #1 {
+ %id = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %id)
+ ret void
+}
+
+; This kernel has no kernarg segment so that there is a mov to check. With the
+; kernarg pointer enabled, the dispatch ID happens to end up in the right place anyway.
+
+; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id:
+; GCN: enable_sgpr_dispatch_id = 1
+
+; GCN: s_mov_b64 s[6:7], s[4:5]
+define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 {
+ call void @use_dispatch_id()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_x:
+; GCN: s_waitcnt
+; GCN: ; use s6
+define void @use_workgroup_id_x() #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_stack_workgroup_id_x:
+; GCN: s_waitcnt
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4
+; GCN: ; use s6
+; GCN: s_setpc_b64
+define void @use_stack_workgroup_id_x() #1 {
+ %alloca = alloca i32
+ store volatile i32 0, i32* %alloca
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_y:
+; GCN: s_waitcnt
+; GCN: ; use s6
+define void @use_workgroup_id_y() #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_z:
+; GCN: s_waitcnt
+; GCN: ; use s6
+define void @use_workgroup_id_z() #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_xy:
+; GCN: ; use s6
+; GCN: ; use s7
+define void @use_workgroup_id_xy() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_xyz:
+; GCN: ; use s6
+; GCN: ; use s7
+; GCN: ; use s8
+define void @use_workgroup_id_xyz() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ call void asm sideeffect "; use $0", "s"(i32 %val2)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_xz:
+; GCN: ; use s6
+; GCN: ; use s7
+define void @use_workgroup_id_xz() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_yz:
+; GCN: ; use s6
+; GCN: ; use s7
+define void @use_workgroup_id_yz() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 0
+; GCN: enable_sgpr_workgroup_id_z = 0
+
+; GCN-NOT: s6
+; GCN: s_mov_b32 s33, s7
+; GCN-NOT: s6
+; GCN: s_mov_b32 s4, s33
+; GCN-NOT: s6
+; GCN: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
+ call void @use_workgroup_id_x()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 0
+
+; GCN: s_mov_b32 s33, s8
+; GCN: s_mov_b32 s4, s33
+; GCN: s_mov_b32 s6, s7
+; GCN: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
+ call void @use_workgroup_id_y()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 0
+; GCN: enable_sgpr_workgroup_id_z = 1
+
+; GCN: s_mov_b32 s33, s8
+; GCN: s_mov_b32 s4, s33
+; GCN: s_mov_b32 s6, s7
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
+ call void @use_workgroup_id_z()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 0
+
+; GCN: s_mov_b32 s33, s8
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN: s_mov_b32 s4, s33
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN: s_mov_b32 s32, s33
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
+ call void @use_workgroup_id_xy()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 1
+
+; GCN: s_mov_b32 s33, s9
+
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN-NOT: s8
+
+; GCN: s_mov_b32 s4, s33
+
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN-NOT: s8
+
+; GCN: s_mov_b32 s32, s33
+
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN-NOT: s8
+
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
+ call void @use_workgroup_id_xyz()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 0
+; GCN: enable_sgpr_workgroup_id_z = 1
+
+; GCN: s_mov_b32 s33, s8
+; GCN-NOT: s6
+; GCN-NOT: s7
+
+; GCN: s_mov_b32 s4, s33
+; GCN-NOT: s6
+; GCN-NOT: s7
+
+; GCN: s_mov_b32 s32, s33
+; GCN-NOT: s6
+; GCN-NOT: s7
+
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
+ call void @use_workgroup_id_xz()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 1
+
+; GCN: s_mov_b32 s33, s9
+; GCN: s_mov_b32 s6, s7
+; GCN: s_mov_b32 s4, s33
+; GCN: s_mov_b32 s7, s8
+; GCN: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
+ call void @use_workgroup_id_yz()
+ ret void
+}
+
+; The argument is already in the right place.
+; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x:
+; GCN-NOT: s6
+define void @func_indirect_use_workgroup_id_x() #1 {
+ call void @use_workgroup_id_x()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
+; GCN-NOT: s6
+define void @func_indirect_use_workgroup_id_y() #1 {
+ call void @use_workgroup_id_y()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
+; GCN-NOT: s6
+define void @func_indirect_use_workgroup_id_z() #1 {
+ call void @use_workgroup_id_z()
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x:
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN: ; use s6
+define void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y:
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN: ; use s6
+define void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z:
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN: ; use s6
+define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_x:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 0
+; GCN: enable_sgpr_workgroup_id_z = 0
+
+; GCN-DAG: s_mov_b32 s33, s7
+; GCN-DAG: v_mov_b32_e32 v0, 0x22b
+
+; GCN-NOT: s6
+; GCN: s_mov_b32 s4, s33
+; GCN-NOT: s6
+; GCN-DAG: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
+ call void @other_arg_use_workgroup_id_x(i32 555)
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 0
+
+; GCN-DAG: s_mov_b32 s33, s8
+; GCN-DAG: v_mov_b32_e32 v0, 0x22b
+; GCN: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s6, s7
+; GCN-DAG: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
+ call void @other_arg_use_workgroup_id_y(i32 555)
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 0
+; GCN: enable_sgpr_workgroup_id_z = 1
+
+; GCN: s_mov_b32 s33, s8
+; GCN-DAG: v_mov_b32_e32 v0, 0x22b
+; GCN: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s6, s7
+
+; GCN: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
+ call void @other_arg_use_workgroup_id_z(i32 555)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_every_sgpr_input:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
+; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0
+; GCN: ; use s[12:13]
+; GCN: ; use s14
+; GCN: ; use s15
+; GCN: ; use s16
+define void @use_every_sgpr_input() #1 {
+ %alloca = alloca i32, align 4
+ store volatile i32 0, i32* %alloca
+
+ %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+ %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
+ %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
+
+ %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+ %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+ %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
+
+ %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+ %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
+ %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
+
+ %val3 = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %val3)
+
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val4)
+
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val5)
+
+ %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val6)
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 1
+; GCN: enable_sgpr_workgroup_info = 0
+
+; GCN: enable_sgpr_private_segment_buffer = 1
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: enable_sgpr_kernarg_segment_ptr = 1
+; GCN: enable_sgpr_dispatch_id = 1
+; GCN: enable_sgpr_flat_scratch_init = 1
+
+; GCN: s_mov_b32 s33, s17
+; GCN: s_mov_b64 s[12:13], s[10:11]
+; GCN: s_mov_b64 s[10:11], s[8:9]
+; GCN: s_mov_b64 s[8:9], s[6:7]
+; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN: s_mov_b32 s4, s33
+; GCN: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
+ call void @use_every_sgpr_input()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input:
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN-NOT: s8
+; GCN-NOT: s9
+; GCN-NOT: s10
+; GCN-NOT: s11
+; GCN-NOT: s12
+; GCN-NOT: s13
+; GCN-NOT: s[6:7]
+; GCN-NOT: s[8:9]
+; GCN-NOT: s[10:11]
+; GCN-NOT: s[12:13]
+define void @func_indirect_use_every_sgpr_input() #1 {
+ call void @use_every_sgpr_input()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz:
+; GCN-DAG: s_mov_b32 s6, s14
+; GCN-DAG: s_mov_b32 s7, s15
+; GCN-DAG: s_mov_b32 s8, s16
+; GCN: s_swappc_b64
+define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
+ %alloca = alloca i32, align 4
+ store volatile i32 0, i32* %alloca
+
+ %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+ %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
+ %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
+
+ %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+ %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+ %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
+
+ %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+ %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
+ %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
+
+ %val3 = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %val3)
+
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val4)
+
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val5)
+
+ %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val6)
+
+ call void @use_workgroup_id_xyz()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
+; GCN: s_mov_b32 s5, s32
+; GCN: s_add_u32 s32, s32, 0x300
+
+; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-9]+]], s14
+; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-9]+]], s15
+; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-9]+]], s16
+; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7]
+; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9]
+; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11]
+
+; GCN-DAG: s_mov_b32 s6, [[SAVE_X]]
+; GCN-DAG: s_mov_b32 s7, [[SAVE_Y]]
+; GCN-DAG: s_mov_b32 s8, [[SAVE_Z]]
+; GCN: s_swappc_b64
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
+; GCN: s_load_dword s{{[0-9]+}},
+; GCN: s_load_dword s{{[0-9]+}},
+; GCN: s_load_dword s{{[0-9]+}},
+; GCN: ; use
+; GCN: ; use [[SAVE_X]]
+; GCN: ; use [[SAVE_Y]]
+; GCN: ; use [[SAVE_Z]]
+define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 {
+ %alloca = alloca i32, align 4
+ call void @use_workgroup_id_xyz()
+
+ store volatile i32 0, i32* %alloca
+
+ %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+ %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
+ %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
+
+ %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+ %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+ %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
+
+ %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+ %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
+ %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
+
+ %val3 = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %val3)
+
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val4)
+
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val5)
+
+ %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val6)
+
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i64 @llvm.amdgcn.dispatch.id() #0
+declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind noinline }
Added: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll?rev=309998&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll Thu Aug 3 16:00:29 2017
@@ -0,0 +1,671 @@
+; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_x() #1 { ; only special input is workitem ID X; the checks expect it in v0
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_y:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_y() #1 { ; only workitem ID Y is requested; the checks expect it in v0, not v1
+ %val = call i32 @llvm.amdgcn.workitem.id.y()
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_z:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_z() #1 { ; only workitem ID Z is requested; the checks expect it in v0, not v2
+ %val = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_xy:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_xy() #1 { ; the checks expect workitem ID X in v0 and ID Y in v1
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* undef
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_xyz:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_xyz() #1 { ; the checks expect workitem IDs X, Y, Z in v0, v1, v2 respectively
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* undef
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ store volatile i32 %val2, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_xz:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_xz() #1 { ; Y is not requested; the checks expect workitem ID X in v0 and ID Z in v1
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* undef
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_yz:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @use_workitem_id_yz() #1 { ; X is not requested; the checks expect workitem ID Y in v0 and ID Z in v1
+ %val0 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* undef
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
+; GCN: enable_vgpr_workitem_id = 0
+
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { ; kernel caller: the checks forbid any mention of v0 before and after the call
+ call void @use_workitem_id_x()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
+; GCN: enable_vgpr_workitem_id = 1
+
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_mov_b32_e32 v0, v1
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { ; the checks expect a single v1 -> v0 copy ahead of the call and no other mention of v0/v1 (v1 is presumably the kernel's incoming ID Y)
+ call void @use_workitem_id_y()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
+; GCN: enable_vgpr_workitem_id = 2
+
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: v_mov_b32_e32 v0, v2
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { ; the checks expect a single v2 -> v0 copy ahead of the call and no other mention of v0/v2 (v2 is presumably the kernel's incoming ID Z)
+ call void @use_workitem_id_z()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define void @func_indirect_use_workitem_id_x() #1 { ; non-kernel caller: the checks forbid any mention of v0 around the call
+ call void @use_workitem_id_x()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y:
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define void @func_indirect_use_workitem_id_y() #1 { ; non-kernel caller: the checks forbid any mention of v0 around the call
+ call void @use_workitem_id_y()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z:
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define void @func_indirect_use_workitem_id_z() #1 { ; non-kernel caller: the checks forbid any mention of v0 around the call
+ call void @use_workitem_id_z()
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { ; the checks expect %arg0 in v0 and workitem ID X in v1
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { ; the checks expect %arg0 in v0 and workitem ID Y in v1
+ %val = call i32 @llvm.amdgcn.workitem.id.y()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
+; GCN: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { ; the checks expect %arg0 in v0 and workitem ID Z in v1
+ %val = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
+; GCN: enable_vgpr_workitem_id = 0
+
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v0, 0x22b
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { ; the checks expect v0 to be copied to v1 first, then the constant 555 (0x22b) placed in v0
+ call void @other_arg_use_workitem_id_x(i32 555)
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
+; GCN: enable_vgpr_workitem_id = 1
+
+; GCN-NOT: v1
+; GCN: v_mov_b32_e32 v0, 0x22b
+; GCN-NOT: v1
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { ; the checks expect 555 (0x22b) in v0 and no mention of v1 before the call
+ call void @other_arg_use_workitem_id_y(i32 555)
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
+; GCN: enable_vgpr_workitem_id = 2
+
+; GCN: v_mov_b32_e32 v0, 0x22b
+; GCN: v_mov_b32_e32 v1, v2
+; GCN: s_swappc_b64
+; GCN-NOT: v0
+define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { ; the checks expect 555 (0x22b) in v0 and a v2 -> v1 copy before the call
+ call void @other_arg_use_workitem_id_z(i32 555)
+ ret void
+}
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @too_many_args_use_workitem_id_x( ; 32 i32 arguments; the checks expect workitem ID X to be loaded from the stack (s5 offset:4) into a spilled v32
+ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+ i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+ i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+ i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val, i32 addrspace(1)* undef
+
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %arg1, i32 addrspace(1)* undef
+ store volatile i32 %arg2, i32 addrspace(1)* undef
+ store volatile i32 %arg3, i32 addrspace(1)* undef
+ store volatile i32 %arg4, i32 addrspace(1)* undef
+ store volatile i32 %arg5, i32 addrspace(1)* undef
+ store volatile i32 %arg6, i32 addrspace(1)* undef
+ store volatile i32 %arg7, i32 addrspace(1)* undef
+
+ store volatile i32 %arg8, i32 addrspace(1)* undef
+ store volatile i32 %arg9, i32 addrspace(1)* undef
+ store volatile i32 %arg10, i32 addrspace(1)* undef
+ store volatile i32 %arg11, i32 addrspace(1)* undef
+ store volatile i32 %arg12, i32 addrspace(1)* undef
+ store volatile i32 %arg13, i32 addrspace(1)* undef
+ store volatile i32 %arg14, i32 addrspace(1)* undef
+ store volatile i32 %arg15, i32 addrspace(1)* undef
+
+ store volatile i32 %arg16, i32 addrspace(1)* undef
+ store volatile i32 %arg17, i32 addrspace(1)* undef
+ store volatile i32 %arg18, i32 addrspace(1)* undef
+ store volatile i32 %arg19, i32 addrspace(1)* undef
+ store volatile i32 %arg20, i32 addrspace(1)* undef
+ store volatile i32 %arg21, i32 addrspace(1)* undef
+ store volatile i32 %arg22, i32 addrspace(1)* undef
+ store volatile i32 %arg23, i32 addrspace(1)* undef
+
+ store volatile i32 %arg24, i32 addrspace(1)* undef
+ store volatile i32 %arg25, i32 addrspace(1)* undef
+ store volatile i32 %arg26, i32 addrspace(1)* undef
+ store volatile i32 %arg27, i32 addrspace(1)* undef
+ store volatile i32 %arg28, i32 addrspace(1)* undef
+ store volatile i32 %arg29, i32 addrspace(1)* undef
+ store volatile i32 %arg30, i32 addrspace(1)* undef
+ store volatile i32 %arg31, i32 addrspace(1)* undef
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
+; GCN: enable_vgpr_workitem_id = 0
+
+; GCN: s_mov_b32 s33, s7
+; GCN: s_mov_b32 s32, s33
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: s_mov_b32 s4, s33
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { ; kernel caller: the checks expect v0 to be stored to s32 offset:8 before the call
+ call void @too_many_args_use_workitem_id_x(
+ i32 10, i32 20, i32 30, i32 40,
+ i32 50, i32 60, i32 70, i32 80,
+ i32 90, i32 100, i32 110, i32 120,
+ i32 130, i32 140, i32 150, i32 160,
+ i32 170, i32 180, i32 190, i32 200,
+ i32 210, i32 220, i32 230, i32 240,
+ i32 250, i32 260, i32 270, i32 280,
+ i32 290, i32 300, i32 310, i32 320)
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN: s_swappc_b64
+define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; non-kernel caller with one ordinary argument: the checks expect v1 (not v0) to be stored to s32 offset:8
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void @too_many_args_use_workitem_id_x(
+ i32 10, i32 20, i32 30, i32 40,
+ i32 50, i32 60, i32 70, i32 80,
+ i32 90, i32 100, i32 110, i32 120,
+ i32 130, i32 140, i32 150, i32 160,
+ i32 170, i32 180, i32 190, i32 200,
+ i32 210, i32 220, i32 230, i32 240,
+ i32 250, i32 260, i32 270, i32 280,
+ i32 290, i32 300, i32 310, i32 320)
+ ret void
+}
+
+; The incoming stack-passed workitem ID X has to be loaded (s5 offset:4) and stored again into the outgoing argument area (s32 offset:8).
+; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_add_u32 s32, s32, 0x400{{$}}
+
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
+
+; GCN: s_swappc_b64
+
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
+; GCN: s_sub_u32 s32, s32, 0x400{{$}}
+; GCN: s_setpc_b64
+define void @too_many_args_call_too_many_args_use_workitem_id_x( ; forwards all 32 of its own arguments unchanged to too_many_args_use_workitem_id_x
+ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+ i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+ i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+ i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
+ call void @too_many_args_use_workitem_id_x(
+ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+ i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+ i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+ i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31)
+ ret void
+}
+
+; Callee stack layout (4-byte slots, addressed from s5 in the checks below):
+; frame[0] = emergency stack slot
+; frame[1] = byval arg32 (s5 offset:4)
+; frame[2] = stack passed workitem ID x (s5 offset:8)
+; frame[3] = VGPR spill slot (s5 offset:12)
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
+; GCN: s_setpc_b64
+define void @too_many_args_use_workitem_id_x_byval( ; 32 i32 arguments plus a trailing byval pointer %arg32
+ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+ i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+ i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+ i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val, i32 addrspace(1)* undef
+
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %arg1, i32 addrspace(1)* undef
+ store volatile i32 %arg2, i32 addrspace(1)* undef
+ store volatile i32 %arg3, i32 addrspace(1)* undef
+ store volatile i32 %arg4, i32 addrspace(1)* undef
+ store volatile i32 %arg5, i32 addrspace(1)* undef
+ store volatile i32 %arg6, i32 addrspace(1)* undef
+ store volatile i32 %arg7, i32 addrspace(1)* undef
+
+ store volatile i32 %arg8, i32 addrspace(1)* undef
+ store volatile i32 %arg9, i32 addrspace(1)* undef
+ store volatile i32 %arg10, i32 addrspace(1)* undef
+ store volatile i32 %arg11, i32 addrspace(1)* undef
+ store volatile i32 %arg12, i32 addrspace(1)* undef
+ store volatile i32 %arg13, i32 addrspace(1)* undef
+ store volatile i32 %arg14, i32 addrspace(1)* undef
+ store volatile i32 %arg15, i32 addrspace(1)* undef
+
+ store volatile i32 %arg16, i32 addrspace(1)* undef
+ store volatile i32 %arg17, i32 addrspace(1)* undef
+ store volatile i32 %arg18, i32 addrspace(1)* undef
+ store volatile i32 %arg19, i32 addrspace(1)* undef
+ store volatile i32 %arg20, i32 addrspace(1)* undef
+ store volatile i32 %arg21, i32 addrspace(1)* undef
+ store volatile i32 %arg22, i32 addrspace(1)* undef
+ store volatile i32 %arg23, i32 addrspace(1)* undef
+
+ store volatile i32 %arg24, i32 addrspace(1)* undef
+ store volatile i32 %arg25, i32 addrspace(1)* undef
+ store volatile i32 %arg26, i32 addrspace(1)* undef
+ store volatile i32 %arg27, i32 addrspace(1)* undef
+ store volatile i32 %arg28, i32 addrspace(1)* undef
+ store volatile i32 %arg29, i32 addrspace(1)* undef
+ store volatile i32 %arg30, i32 addrspace(1)* undef
+ store volatile i32 %arg31, i32 addrspace(1)* undef
+ %private = load volatile i32, i32* %arg32 ; reads the byval argument; the checks expect a load into v0 from s5 offset:4
+ ret void
+}
+
+; frame[0] = emergency stack slot
+; frame[1] = %alloca holding 999 (the checks store 0x3e7 to s33 offset:4)
+
+; sp[0] = callee emergency stack slot reservation
+; sp[1] = byval copy of %alloca (s32 offset:4)
+; sp[2] = ?? (no check below references s32 offset:8)
+; sp[3] = stack passed workitem ID x (s32 offset:12)
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
+; GCN: enable_vgpr_workitem_id = 0
+
+; GCN: s_mov_b32 s33, s7
+; GCN: s_add_u32 s32, s33, 0x200{{$}}
+
+; GCN-DAG: s_add_u32 s32, s32, 0x100{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { ; kernel caller passing a local alloca as the byval argument
+ %alloca = alloca i32, align 4
+ store volatile i32 999, i32* %alloca
+ call void @too_many_args_use_workitem_id_x_byval(
+ i32 10, i32 20, i32 30, i32 40,
+ i32 50, i32 60, i32 70, i32 80,
+ i32 90, i32 100, i32 110, i32 120,
+ i32 130, i32 140, i32 150, i32 160,
+ i32 170, i32 180, i32 190, i32 200,
+ i32 210, i32 220, i32 230, i32 240,
+ i32 250, i32 260, i32 270, i32 280,
+ i32 290, i32 300, i32 310, i32 320,
+ i32* %alloca)
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; GCN: s_swappc_b64
+define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ; non-kernel caller passing a local alloca as the byval argument; the checks address the alloca through s5
+ %alloca = alloca i32, align 4
+ store volatile i32 999, i32* %alloca
+ call void @too_many_args_use_workitem_id_x_byval(
+ i32 10, i32 20, i32 30, i32 40,
+ i32 50, i32 60, i32 70, i32 80,
+ i32 90, i32 100, i32 110, i32 120,
+ i32 130, i32 140, i32 150, i32 160,
+ i32 170, i32 180, i32 190, i32 200,
+ i32 210, i32 220, i32 230, i32 240,
+ i32 250, i32 260, i32 270, i32 280,
+ i32 290, i32 300, i32 310, i32 320,
+ i32* %alloca)
+ ret void
+}
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @too_many_args_use_workitem_id_xyz( ; 32 i32 arguments; the checks expect IDs X, Y, Z to be loaded from the stack at s5 offset:4, 8 and 12
+ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+ i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+ i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+ i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val0, i32 addrspace(1)* undef
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val2, i32 addrspace(1)* undef
+
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %arg1, i32 addrspace(1)* undef
+ store volatile i32 %arg2, i32 addrspace(1)* undef
+ store volatile i32 %arg3, i32 addrspace(1)* undef
+ store volatile i32 %arg4, i32 addrspace(1)* undef
+ store volatile i32 %arg5, i32 addrspace(1)* undef
+ store volatile i32 %arg6, i32 addrspace(1)* undef
+ store volatile i32 %arg7, i32 addrspace(1)* undef
+
+ store volatile i32 %arg8, i32 addrspace(1)* undef
+ store volatile i32 %arg9, i32 addrspace(1)* undef
+ store volatile i32 %arg10, i32 addrspace(1)* undef
+ store volatile i32 %arg11, i32 addrspace(1)* undef
+ store volatile i32 %arg12, i32 addrspace(1)* undef
+ store volatile i32 %arg13, i32 addrspace(1)* undef
+ store volatile i32 %arg14, i32 addrspace(1)* undef
+ store volatile i32 %arg15, i32 addrspace(1)* undef
+
+ store volatile i32 %arg16, i32 addrspace(1)* undef
+ store volatile i32 %arg17, i32 addrspace(1)* undef
+ store volatile i32 %arg18, i32 addrspace(1)* undef
+ store volatile i32 %arg19, i32 addrspace(1)* undef
+ store volatile i32 %arg20, i32 addrspace(1)* undef
+ store volatile i32 %arg21, i32 addrspace(1)* undef
+ store volatile i32 %arg22, i32 addrspace(1)* undef
+ store volatile i32 %arg23, i32 addrspace(1)* undef
+
+ store volatile i32 %arg24, i32 addrspace(1)* undef
+ store volatile i32 %arg25, i32 addrspace(1)* undef
+ store volatile i32 %arg26, i32 addrspace(1)* undef
+ store volatile i32 %arg27, i32 addrspace(1)* undef
+ store volatile i32 %arg28, i32 addrspace(1)* undef
+ store volatile i32 %arg29, i32 addrspace(1)* undef
+ store volatile i32 %arg30, i32 addrspace(1)* undef
+ store volatile i32 %arg31, i32 addrspace(1)* undef
+
+ ret void
+}
+
+; frame[0] = kernel emergency stack slot
+; frame[1] = callee emergency stack slot
+; frame[2] = ID X (s32 offset:8 in the checks below)
+; frame[3] = ID Y (s32 offset:12)
+; frame[4] = ID Z (s32 offset:16)
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
+; GCN: enable_vgpr_workitem_id = 2
+
+; GCN: s_mov_b32 s33, s7
+; GCN: s_mov_b32 s32, s33
+
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { ; kernel caller: the checks expect v0, v1, v2 to be stored to the stack (in any order) before the call
+ call void @too_many_args_use_workitem_id_xyz(
+ i32 10, i32 20, i32 30, i32 40,
+ i32 50, i32 60, i32 70, i32 80,
+ i32 90, i32 100, i32 110, i32 120,
+ i32 130, i32 140, i32 150, i32 160,
+ i32 170, i32 180, i32 190, i32 200,
+ i32 210, i32 220, i32 230, i32 240,
+ i32 250, i32 260, i32 270, i32 280,
+ i32 290, i32 300, i32 310, i32 320)
+ ret void
+}
+
+; workitem ID X in register, yz on stack (the function takes only 31 i32 arguments)
+; v31 = workitem ID X
+; frame[0] = emergency slot
+; frame[1] = workitem Y (s5 offset:4)
+; frame[2] = workitem Z (s5 offset:8)
+
+; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
+; GCN: s_mov_b32 s5, s32
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
+; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
+; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
+
+; GCN: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+; GCN: ScratchSize: 12
+define void @too_many_args_use_workitem_id_x_stack_yz( ; 31 i32 arguments (%arg0..%arg30), one fewer than the other too_many_args tests
+ i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
+ i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
+ i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
+ i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val0, i32 addrspace(1)* undef
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val2, i32 addrspace(1)* undef
+
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i32 %arg1, i32 addrspace(1)* undef
+ store volatile i32 %arg2, i32 addrspace(1)* undef
+ store volatile i32 %arg3, i32 addrspace(1)* undef
+ store volatile i32 %arg4, i32 addrspace(1)* undef
+ store volatile i32 %arg5, i32 addrspace(1)* undef
+ store volatile i32 %arg6, i32 addrspace(1)* undef
+ store volatile i32 %arg7, i32 addrspace(1)* undef
+
+ store volatile i32 %arg8, i32 addrspace(1)* undef
+ store volatile i32 %arg9, i32 addrspace(1)* undef
+ store volatile i32 %arg10, i32 addrspace(1)* undef
+ store volatile i32 %arg11, i32 addrspace(1)* undef
+ store volatile i32 %arg12, i32 addrspace(1)* undef
+ store volatile i32 %arg13, i32 addrspace(1)* undef
+ store volatile i32 %arg14, i32 addrspace(1)* undef
+ store volatile i32 %arg15, i32 addrspace(1)* undef
+
+ store volatile i32 %arg16, i32 addrspace(1)* undef
+ store volatile i32 %arg17, i32 addrspace(1)* undef
+ store volatile i32 %arg18, i32 addrspace(1)* undef
+ store volatile i32 %arg19, i32 addrspace(1)* undef
+ store volatile i32 %arg20, i32 addrspace(1)* undef
+ store volatile i32 %arg21, i32 addrspace(1)* undef
+ store volatile i32 %arg22, i32 addrspace(1)* undef
+ store volatile i32 %arg23, i32 addrspace(1)* undef
+
+ store volatile i32 %arg24, i32 addrspace(1)* undef
+ store volatile i32 %arg25, i32 addrspace(1)* undef
+ store volatile i32 %arg26, i32 addrspace(1)* undef
+ store volatile i32 %arg27, i32 addrspace(1)* undef
+ store volatile i32 %arg28, i32 addrspace(1)* undef
+ store volatile i32 %arg29, i32 addrspace(1)* undef
+ store volatile i32 %arg30, i32 addrspace(1)* undef
+
+ ret void
+}
+
+; frame[0] = kernel emergency stack slot
+; frame[1] = callee emergency stack slot
+; frame[2] = ID Y (s32 offset:8 in the checks below)
+; frame[3] = ID Z (s32 offset:12)
+
+; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
+; GCN: enable_vgpr_workitem_id = 2
+
+; GCN: s_mov_b32 s33, s7
+; GCN: s_mov_b32 s32, s33
+
+; GCN-DAG: v_mov_b32_e32 v31, v0
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { ; kernel caller: the checks expect v0 copied to v31 and v1/v2 stored to the stack before the call
+ call void @too_many_args_use_workitem_id_x_stack_yz(
+ i32 10, i32 20, i32 30, i32 40,
+ i32 50, i32 60, i32 70, i32 80,
+ i32 90, i32 100, i32 110, i32 120,
+ i32 130, i32 140, i32 150, i32 160,
+ i32 170, i32 180, i32 190, i32 200,
+ i32 210, i32 220, i32 230, i32 240,
+ i32 250, i32 260, i32 270, i32 280,
+ i32 290, i32 300, i32 310)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+attributes #0 = { nounwind readnone speculatable } ; attribute group attached to the three workitem ID intrinsic declarations above
+attributes #1 = { nounwind noinline } ; attribute group attached to every function definition in this test file
More information about the llvm-commits
mailing list