[llvm] r309732 - AMDGPU: Initial implementation of calls
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 1 12:54:18 PDT 2017
Author: arsenm
Date: Tue Aug 1 12:54:18 2017
New Revision: 309732
URL: http://llvm.org/viewvc/llvm-project?rev=309732&view=rev
Log:
AMDGPU: Initial implementation of calls
Includes a hack to fix the type selected for
the GlobalAddress of the called function; this will be
properly fixed by changing the default datalayout to use
generic (64-bit) pointers for address space 0.
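
For illustration, a minimal sketch of the kind of IR this change can now
compile (adapted from the basic-call-return.ll test added below; it assumes
the new -amdgpu-function-calls flag introduced by this change is passed to
llc, since call support is off by default):

  ; llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls < example.ll
  define void @callee() #0 {
    ret void
  }

  define amdgpu_kernel void @caller() #0 {
    ; Lowered through AMDGPUISD::CALL / SI_CALL and emitted as s_swappc_b64.
    call void @callee()
    ret void
  }

  attributes #0 = { nounwind }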
Added:
llvm/trunk/test/CodeGen/AMDGPU/basic-call-return.ll
llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll
llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUFrameLowering.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h
llvm/trunk/lib/Target/AMDGPU/R600FrameLowering.h
llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td Tue Aug 1 12:54:18 2017
@@ -163,6 +163,10 @@ def CC_AMDGPU : CallingConv<[
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
+ CCDelegateTo<CC_AMDGPU_Func>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_R600>>
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUFrameLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUFrameLowering.h?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUFrameLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUFrameLowering.h Tue Aug 1 12:54:18 2017
@@ -33,10 +33,6 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
-
- bool hasFP(const MachineFunction &MF) const override {
- return false;
- }
};
} // end namespace llvm
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Tue Aug 1 12:54:18 2017
@@ -20,6 +20,7 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp Tue Aug 1 12:54:18 2017
@@ -30,7 +30,9 @@ using namespace llvm;
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ ST(ST),
+ AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td Tue Aug 1 12:54:18 2017
@@ -82,6 +82,22 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", A
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+def callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOutGlue]
+>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
+>;
+
+def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]
+>;
+
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp Tue Aug 1 12:54:18 2017
@@ -121,6 +121,9 @@ bool AMDGPUMCInstLower::lowerOperand(con
MCOp = MCOperand::createExpr(Expr);
return true;
}
+ case MachineOperand::MO_RegisterMask:
+ // Regmasks are like implicit defs.
+ return false;
}
}
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp Tue Aug 1 12:54:18 2017
@@ -56,6 +56,20 @@ const MCPhysReg *SIRegisterInfo::getCall
}
}
+const MCPhysReg *
+SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ // FIXME
+ static MCPhysReg Regs[2];
+
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ assert(!MFI->isEntryFunction());
+
+ Regs[0] = MFI->getFrameOffsetReg();
+ Regs[1] = AMDGPU::NoRegister;
+
+ return Regs;
+}
+
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Tue Aug 1 12:54:18 2017
@@ -123,6 +123,12 @@ static cl::opt<bool> LateCFGStructurize(
cl::init(false),
cl::Hidden);
+static cl::opt<bool> EnableAMDGPUFunctionCalls(
+ "amdgpu-function-calls",
+ cl::Hidden,
+ cl::desc("Enable AMDGPU function call support"),
+ cl::init(false));
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -269,6 +275,11 @@ AMDGPUTargetMachine::AMDGPUTargetMachine
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+bool AMDGPUTargetMachine::enableFunctionCalls() const {
+ return EnableAMDGPUFunctionCalls &&
+ getTargetTriple().getArch() == Triple::amdgcn;
+}
+
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.hasAttribute(Attribute::None) ?
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h Tue Aug 1 12:54:18 2017
@@ -69,6 +69,9 @@ public:
return -1;
return 0;
}
+
+ LLVM_READONLY
+ bool enableFunctionCalls() const;
};
//===----------------------------------------------------------------------===//
Modified: llvm/trunk/lib/Target/AMDGPU/R600FrameLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/R600FrameLowering.h?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/R600FrameLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/R600FrameLowering.h Tue Aug 1 12:54:18 2017
@@ -27,6 +27,10 @@ public:
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+
+ bool hasFP(const MachineFunction &MF) const override {
+ return false;
+ }
};
} // end namespace llvm
Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Tue Aug 1 12:54:18 2017
@@ -575,6 +575,41 @@ void SIFrameLowering::processFunctionBef
}
}
+MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ int64_t Amount = I->getOperand(0).getImm();
+ if (Amount == 0)
+ return MBB.erase(I);
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const DebugLoc &DL = I->getDebugLoc();
+ unsigned Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (!TFI->hasReservedCallFrame(MF)) {
+ unsigned Align = getStackAlignment();
+
+ Amount = alignTo(Amount, Align);
+ assert(isUInt<32>(Amount) && "exceeded stack address space size");
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+
+ unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
+ BuildMI(MBB, I, DL, TII->get(Op), SPReg)
+ .addReg(SPReg)
+ .addImm(Amount * ST.getWavefrontSize());
+ } else if (CalleePopAmount != 0) {
+ llvm_unreachable("is this used?");
+ }
+
+ return MBB.erase(I);
+}
+
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h Tue Aug 1 12:54:18 2017
@@ -39,6 +39,11 @@ public:
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
private:
void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Tue Aug 1 12:54:18 2017
@@ -1201,9 +1201,13 @@ static void reservePrivateMemoryRegs(con
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ // For now assume stack access is needed in any callee functions, so we need
+ // the scratch registers to pass in.
+ bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
+
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
@@ -1212,9 +1216,23 @@ static void reservePrivateMemoryRegs(con
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ if (MFI.hasCalls()) {
+ // If we have calls, we need to keep the frame register in a register
+ // that won't be clobbered by a call, so ensure it is copied somewhere.
+
+ // This is not a problem for the scratch wave offset, because the same
+ // registers are reserved in all functions.
+
+ // FIXME: Nothing is really ensuring this is a call preserved register,
+ // it's just selected from the end so it happens to be.
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ } else {
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ }
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@@ -1237,7 +1255,7 @@ static void reservePrivateMemoryRegs(con
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
+ if (HasStackObjects && !MFI.hasCalls()) {
unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
@@ -1249,6 +1267,50 @@ static void reservePrivateMemoryRegs(con
}
}
+bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
+}
+
+void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+
+}
+
+void SITargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_64RegClass;
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1589,6 +1651,22 @@ SITargetLowering::LowerReturn(SDValue Ch
}
// FIXME: Does sret work properly?
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+ }
// Update chain and glue.
RetOps[0] = Chain;
@@ -1601,6 +1679,296 @@ SITargetLowering::LowerReturn(SDValue Ch
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
+SDValue SITargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
+ SDValue ThisVal) const {
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+ SDValue Val;
+
+ if (VA.isRegLoc()) {
+ Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ } else if (VA.isMemLoc()) {
+ report_fatal_error("TODO: return values in memory");
+ } else
+ llvm_unreachable("unknown argument location type");
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+// The wave scratch offset register is used as the global base pointer.
+SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+ if (!TM.enableFunctionCalls())
+ return AMDGPUTargetLowering::LowerCall(CLI, InVals);
+
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ bool IsSibCall = false;
+ bool IsThisReturn = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // TODO: Implement tail calls.
+ IsTailCall = false;
+
+ if (IsVarArg || MF.getTarget().Options.GuaranteedTailCallOpt) {
+ report_fatal_error("varargs and tail calls not implemented");
+ }
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // FIXME: Remove this hack for function pointer types.
+ const GlobalValue *GV = GA->getGlobal();
+ assert(Callee.getValueType() == MVT::i32);
+ Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
+ false, GA->getTargetFlags());
+ }
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall) {
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
+
+ unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+
+ // TODO: Don't hardcode these registers and get from the callee function.
+ SDValue ScratchWaveOffsetReg
+ = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
+ }
+
+ // Stack pointer relative accesses are done by changing the offset SGPR. This
+ // is just the VGPR offset component.
+ SDValue StackPtr = DAG.getConstant(0, DL, MVT::i32);
+
+ SmallVector<SDValue, 8> MemOpChains;
+ MVT PtrVT = MVT::i32;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset;
+ SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+
+ if (!IsTailCall) {
+ SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false,
+ DstInfo, MachinePointerInfo());
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass) {
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ llvm_unreachable("not implemented");
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
+ Chain = Call.getValue(0);
+ InFlag = Call.getValue(1);
+
+ uint64_t CalleePopBytes = 0;
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -2266,6 +2634,27 @@ MachineBasicBlock *SITargetLowering::Emi
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::ADJCALLSTACKUP:
+ case AMDGPU::ADJCALLSTACKDOWN: {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ MachineInstrBuilder MIB(*MF, &MI);
+ MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ return BB;
+ }
+ case AMDGPU::SI_CALL: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MI.eraseFromParent();
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@@ -2931,13 +3320,16 @@ SDValue SITargetLowering::LowerGlobalAdd
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Tue Aug 1 12:54:18 2017
@@ -183,6 +183,12 @@ public:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ bool supportSplitCSR(MachineFunction *MF) const override;
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -199,6 +205,15 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Tue Aug 1 12:54:18 2017
@@ -317,6 +317,45 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI
let DisableWQM = 1;
}
+// Return for returning function calls.
+def SI_RETURN : SPseudoInstSI <
+ (outs), (ins), [],
+ "; return"> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let SchedRW = [WriteBranch];
+}
+
+// Call a function.
+def SI_CALL : SPseudoInstSI <
+ (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
+ "; call $src0"> {
+ let Size = 4;
+ let isCall = 1;
+ let SchedRW = [WriteBranch];
+ let usesCustomInserter = 1;
+}
+
+def ADJCALLSTACKUP : SPseudoInstSI<
+ (outs), (ins i32imm:$amt0, i32imm:$amt1),
+ [(callseq_start timm:$amt0, timm:$amt1)],
+ "; adjcallstackup $amt0 $amt1"> {
+ let Size = 8; // Worst case. (s_add_u32 + constant)
+ let FixedSize = 1;
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
+def ADJCALLSTACKDOWN : SPseudoInstSI<
+ (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)],
+ "; adjcallstackdown $amt1"> {
+ let Size = 8; // Worst case. (s_add_u32 + constant)
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Tue Aug 1 12:54:18 2017
@@ -236,8 +236,15 @@ bool SIRegisterInfo::requiresRegisterSca
return true;
}
-bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return MF.getFrameInfo().hasStackObjects();
+bool SIRegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasStackObjects())
+ return true;
+
+ // May need to deal with callee saved registers.
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h Tue Aug 1 12:54:18 2017
@@ -63,6 +63,7 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td Tue Aug 1 12:54:18 2017
@@ -269,6 +269,18 @@ def VGPR_512 : RegisterTuples<[sub0, sub
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
+def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
+def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
+ (add PRIVATE_RSRC_REG)> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
Added: llvm/trunk/test/CodeGen/AMDGPU/basic-call-return.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/basic-call-return.ll?rev=309732&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/basic-call-return.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/basic-call-return.ll Tue Aug 1 12:54:18 2017
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define void @void_func_void() #2 {
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void:
+define amdgpu_kernel void @test_call_void_func_void() {
+ call void @void_func_void()
+ ret void
+}
+
+define void @void_func_void_clobber_s40_s41() #2 {
+ call void asm sideeffect "", "~{SGPR40_SGPR41}"() #0
+ ret void
+}
+
+define amdgpu_kernel void @test_call_void_func_void_clobber_s40_s41() {
+ call void @void_func_void_clobber_s40_s41()
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind noinline }
Added: llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll?rev=309732&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll Tue Aug 1 12:54:18 2017
@@ -0,0 +1,235 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+
+%struct.ByValStruct = type { [4 x i32] }
+
+; GCN-LABEL: {{^}}void_func_byval_struct:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:16{{$}}
+; GCN-NOT: s32
+define void @void_func_byval_struct(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
+entry:
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
+ %tmp = load volatile i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %tmp, 1
+ store volatile i32 %add, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
+ %tmp1 = load volatile i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %tmp1, 2
+ store volatile i32 %add3, i32* %arrayidx2, align 4
+ store volatile i32 9, i32 addrspace(1)* null, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32
+; GCN: v_writelane_b32
+
+; GCN-DAG: s_add_u32 s32, s32, 0x900{{$}}
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
+; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN: v_add_i32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
+
+; GCN: s_swappc_b64
+
+; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
+
+; GCN: v_readlane_b32
+; GCN: buffer_load_dword v32,
+; GCN: s_sub_u32 s32, s32, 0x900{{$}}
+; GCN: s_setpc_b64
+define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
+entry:
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
+ %tmp = load volatile i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %tmp, 1
+ store volatile i32 %add, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
+ %tmp1 = load volatile i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %tmp1, 2
+ call void @external_void_func_void()
+ store volatile i32 %add3, i32* %arrayidx2, align 4
+ store volatile i32 9, i32 addrspace(1)* null, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
+; GCN: s_mov_b32 s5, s32
+; GCN: s_add_u32 s32, s32, 0xa00{{$}}
+; GCN: v_writelane_b32
+
+; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+
+; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
+; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
+
+; GCN-DAG: v_add_i32_e64 [[FI_ADD0:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 8,
+; GCN-DAG: v_or_b32_e32 [[FI_OR0:v[0-9]+]], 4, [[FI_ADD0]]
+
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:4
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:8
+
+; FIXME: or fails to combine with add, so FI doesn't fold and scratch wave offset is used
+; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
+; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
+
+; GCN-DAG: v_add_i32_e64 [[FI_ADD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 24,
+; GCN-DAG: v_or_b32_e32 [[FI_OR1:v[0-9]+]], 4, [[FI_ADD1]]
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+
+
+
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
+
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:4
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:8
+; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+
+
+; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
+
+; GCN: s_swappc_b64
+; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
+
+; GCN: v_readlane_b32
+
+; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @call_void_func_byval_struct_func() #0 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 4
+ %arg1 = alloca %struct.ByValStruct, align 4
+ %tmp = bitcast %struct.ByValStruct* %arg0 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32* %arrayidx2, align 4
+ call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
+ ret void
+}
+
+; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel:
+; GCN: s_mov_b32 s33, s7
+; GCN: s_add_u32 s32, s33, 0xa00{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
+; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
+
+; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
+
+; FIXME: Fold offset
+; GCN-DAG: v_or_b32_e32 [[OR_FI0:v[0-9]+]], 4,
+
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:4
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:8
+
+; FIXME: Fold offset
+; GCN-DAG: v_or_b32_e32 [[OR_FI1:v[0-9]+]], 4,
+
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
+
+
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:12
+; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
+
+
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:4
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:8
+; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+
+
+; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:24
+; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
+
+
+; GCN: s_swappc_b64
+; FIXME: Dead SP modification
+; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 4
+ %arg1 = alloca %struct.ByValStruct, align 4
+ %tmp = bitcast %struct.ByValStruct* %arg0 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32* %arrayidx2, align 4
+ call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
+ ret void
+}
+
+; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
+define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 4
+ %arg1 = alloca %struct.ByValStruct, align 4
+ %tmp = bitcast %struct.ByValStruct* %arg0 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32* %arrayidx2, align 4
+ call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
+ ret void
+}
+
+declare void @external_void_func_void() #0
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3
+
+attributes #0 = { nounwind }
+attributes #1 = { noinline norecurse nounwind }
+attributes #2 = { nounwind norecurse "no-frame-pointer-elim"="true" }
Added: llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll?rev=309732&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll Tue Aug 1 12:54:18 2017
@@ -0,0 +1,527 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s
+
+declare void @external_void_func_i1(i1) #0
+declare void @external_void_func_i1_signext(i1 signext) #0
+declare void @external_void_func_i1_zeroext(i1 zeroext) #0
+
+declare void @external_void_func_i8(i8) #0
+declare void @external_void_func_i8_signext(i8 signext) #0
+declare void @external_void_func_i8_zeroext(i8 zeroext) #0
+
+declare void @external_void_func_i16(i16) #0
+declare void @external_void_func_i16_signext(i16 signext) #0
+declare void @external_void_func_i16_zeroext(i16 zeroext) #0
+
+declare void @external_void_func_i32(i32) #0
+declare void @external_void_func_i64(i64) #0
+
+declare void @external_void_func_f16(half) #0
+declare void @external_void_func_f32(float) #0
+declare void @external_void_func_f64(double) #0
+
+declare void @external_void_func_v2i16(<2 x i16>) #0
+declare void @external_void_func_v2f16(<2 x half>) #0
+
+declare void @external_void_func_v2i32(<2 x i32>) #0
+declare void @external_void_func_v3i32(<3 x i32>) #0
+declare void @external_void_func_v4i32(<4 x i32>) #0
+declare void @external_void_func_v8i32(<8 x i32>) #0
+declare void @external_void_func_v16i32(<16 x i32>) #0
+declare void @external_void_func_v32i32(<32 x i32>) #0
+declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
+
+; return value and argument
+declare i32 @external_i32_func_i32(i32) #0
+
+; Structs
+declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0
+declare void @external_void_func_byval_struct_i8_i32({ i8, i32 }* byval) #0
+declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* sret, { i8, i32 }* byval) #0
+
+declare void @external_void_func_v16i8(<16 x i8>) #0
+
+
+; FIXME: Should be passing -1
+; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm:
+; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD
+
+; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
+
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
+; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
+; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
+
+; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+ call void @external_void_func_i1(i1 true)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
+; MESA: s_mov_b32 s33, s3{{$}}
+; HSA: s_mov_b32 s33, s9{{$}}
+
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4
+; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]]
+; HSA-NEXT: s_mov_b32 s4, s33
+; HSA-NEXT: s_mov_b32 s32, s33
+
+; MESA-DAG: s_mov_b32 s4, s33{{$}}
+; MESA-DAG: s_mov_b32 s32, s33{{$}}
+
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+ %var = load volatile i1, i1 addrspace(1)* undef
+ call void @external_void_func_i1_signext(i1 %var)
+ ret void
+}
+
+; FIXME: load should be scheduled before getpc
+; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
+; MESA: s_mov_b32 s33, s3{{$}}
+
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+4
+; GCN-NEXT: buffer_load_ubyte v0
+
+; GCN-DAG: s_mov_b32 s4, s33{{$}}
+; GCN-DAG: s_mov_b32 s32, s33{{$}}
+
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
+ %var = load volatile i1, i1 addrspace(1)* undef
+ call void @external_void_func_i1_zeroext(i1 %var)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
+; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
+
+; HSA-DAG: s_mov_b32 s4, s33{{$}}
+; GCN-DAG: s_mov_b32 s32, s33{{$}}
+
+; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
+ call void @external_void_func_i8(i8 123)
+ ret void
+}
+
+; FIXME: don't wait before call
+; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
+; HSA-DAG: s_mov_b32 s33, s9{{$}}
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+
+; GCN-DAG: buffer_load_sbyte v0
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
+
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s3
+
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
+ %var = load volatile i8, i8 addrspace(1)* undef
+ call void @external_void_func_i8_signext(i8 %var)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+; HSA-DAG: s_mov_b32 s33, s9{{$}}
+
+; GCN-DAG: buffer_load_ubyte v0
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
+
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
+ %var = load volatile i8, i8 addrspace(1)* undef
+ call void @external_void_func_i8_zeroext(i8 %var)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
+
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
+ call void @external_void_func_i16(i16 123)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+
+; GCN-DAG: buffer_load_sshort v0
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
+
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
+ %var = load volatile i16, i16 addrspace(1)* undef
+ call void @external_void_func_i16_signext(i16 %var)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext:
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+
+
+; GCN-DAG: buffer_load_ushort v0
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
+
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
+ %var = load volatile i16, i16 addrspace(1)* undef
+ call void @external_void_func_i16_zeroext(i16 %var)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+
+; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
+; GCN: v_mov_b32_e32 v0, 42
+; GCN-DAG: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+
+; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
+ call void @external_void_func_i32(i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm:
+; GCN-DAG: s_movk_i32 [[K0:s[0-9]+]], 0x7b{{$}}
+; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[K0]]
+; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4
+; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+4
+; GCN-DAG: v_mov_b32_e32 v1, [[K1]]
+; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
+ call void @external_void_func_i64(i64 123)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
+; VI: v_mov_b32_e32 v0, 0x4400
+; CI: v_mov_b32_e32 v0, 4.0
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
+ call void @external_void_func_f16(half 4.0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm:
+; GCN: v_mov_b32_e32 v0, 4.0
+; GCN-NOT: v0
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
+ call void @external_void_func_f32(float 4.0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
+; GCN: v_mov_b32_e32 v0, 0{{$}}
+; GCN: v_mov_b32_e32 v1, 0x40100000
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
+ call void @external_void_func_f64(double 4.0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
+; GFX9: buffer_load_dword v0
+; GFX9-NOT: v0
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
+ %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
+ call void @external_void_func_v2i16(<2 x i16> %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
+; GFX9: buffer_load_dword v0
+; GFX9-NOT: v0
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* undef
+ call void @external_void_func_v2f16(<2 x half> %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
+; GCN: buffer_load_dwordx2 v[0:1]
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
+ call void @external_void_func_v2i32(<2 x i32> %val)
+ ret void
+}
+
+; FIXME: Passing 4th
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
+; HSA-DAG: s_mov_b32 s33, s9
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+
+; GCN-DAG: v_mov_b32_e32 v0
+; GCN-DAG: v_mov_b32_e32 v1
+; GCN-DAG: v_mov_b32_e32 v2
+; GCN-DAG: v_mov_b32_e32 v3
+
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
+ call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
+; GCN: buffer_load_dwordx4 v[0:3]
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
+ call void @external_void_func_v4i32(<4 x i32> %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
+ %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
+ call void @external_void_func_v8i32(<8 x i32> %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
+ %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
+ call void @external_void_func_v16i32(<16 x i32> %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v32i32:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN-DAG: buffer_load_dwordx4 v[16:19], off
+; GCN-DAG: buffer_load_dwordx4 v[20:23], off
+; GCN-DAG: buffer_load_dwordx4 v[24:27], off
+; GCN-DAG: buffer_load_dwordx4 v[28:31], off
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
+ %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
+ call void @external_void_func_v32i32(<32 x i32> %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32:
+; HSA-DAG: s_mov_b32 s33, s9
+; HSA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}}
+
+; MESA-DAG: s_mov_b32 s33, s3{{$}}
+; MESA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}}
+
+; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN-DAG: buffer_load_dwordx4 v[16:19], off
+; GCN-DAG: buffer_load_dwordx4 v[20:23], off
+; GCN-DAG: buffer_load_dwordx4 v[24:27], off
+; GCN-DAG: buffer_load_dwordx4 v[28:31], off
+
+; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SP_REG]]{{$}}
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
+ %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
+ %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
+ %val1 = load i32, i32 addrspace(1)* undef
+ call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
+ ret void
+}
+
+; FIXME: No wait after call
+; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
+; GCN: v_mov_b32_e32 v0, 42
+; GCN: s_swappc_b64 s[30:31],
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
+define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
+ %val = call i32 @external_i32_func_i32(i32 42)
+ store volatile i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
+; GCN: buffer_load_ubyte v0, off
+; GCN: buffer_load_dword v1, off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
+ %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef
+ %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+ call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
+; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
+; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
+; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8
+; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12
+
+; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8
+; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12
+
+; GCN: s_add_u32 [[SP]], [[SP]], 0x200
+
+; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
+; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
+
+; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
+; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
+
+
+; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
+; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
+
+; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
+; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
+
+; GCN-NEXT: s_swappc_b64
+; GCN-NEXT: s_sub_u32 [[SP]], [[SP]], 0x200
+define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
+ %val = alloca { i8, i32 }, align 4
+ %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0
+ %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 1
+ store i8 3, i8* %gep0
+ store i32 8, i32* %gep1
+ call void @external_void_func_byval_struct_i8_i32({ i8, i32 }* %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
+; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
+; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
+; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
+; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
+
+; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
+; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
+
+; GCN-DAG: s_add_u32 [[SP]], [[SP]], 0x200
+; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
+; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
+; GCN-NEXT: s_swappc_b64
+; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
+; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
+; GCN: s_sub_u32 [[SP]], [[SP]], 0x200
+
+; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off
+; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off
+define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
+ %in.val = alloca { i8, i32 }, align 4
+ %out.val = alloca { i8, i32 }, align 4
+ %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 0
+ %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 1
+ store i8 3, i8* %in.gep0
+ store i32 8, i32* %in.gep1
+ call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* %out.val, { i8, i32 }* %in.val)
+ %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 0
+ %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 1
+ %out.val0 = load i8, i8* %out.gep0
+ %out.val1 = load i32, i32* %out.gep1
+
+ store volatile i8 %out.val0, i8 addrspace(1)* undef
+ store volatile i32 %out.val1, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
+define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
+ %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
+ %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+ call void @external_void_func_v16i8(<16 x i8> %val)
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind noinline }
Added: llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll?rev=309732&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll Tue Aug 1 12:54:18 2017
@@ -0,0 +1,251 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare void @external_void_func_void() #0
+
+; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; GCN: s_mov_b32 s33, s7
+; GCN: s_getpc_b64 s[34:35]
+; GCN-NEXT: s_add_u32 s34, s34,
+; GCN-NEXT: s_addc_u32 s35, s35,
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_mov_b32 s32, s33
+; GCN: s_swappc_b64 s[30:31], s[34:35]
+
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: #ASMSTART
+; GCN-NEXT: #ASMEND
+; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
+define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+ call void @external_void_func_void()
+ call void asm sideeffect "", ""() #0
+ call void @external_void_func_void()
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; GCN: v_writelane_b32 v32, s33, 0
+; GCN: v_writelane_b32 v32, s34, 1
+; GCN: v_writelane_b32 v32, s35, 2
+; GCN: v_writelane_b32 v32, s36, 3
+; GCN: v_writelane_b32 v32, s37, 4
+
+; GCN: s_mov_b32 s33, s5
+; GCN: s_swappc_b64
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_swappc_b64
+; GCN: s_mov_b32 s5, s33
+; GCN: v_readlane_b32 s37, v32, 4
+; GCN: v_readlane_b32 s36, v32, 3
+; GCN: v_readlane_b32 s35, v32, 2
+; GCN: v_readlane_b32 s34, v32, 1
+; GCN: v_readlane_b32 s33, v32, 0
+; GCN: s_setpc_b64
+define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+ call void @external_void_func_void()
+ call void asm sideeffect "", ""() #0
+ call void @external_void_func_void()
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31:
+; GCN: s_waitcnt
+; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31]
+; GCN-NEXT: #ASMSTART
+; GCN: ; clobber
+; GCN-NEXT: #ASMEND
+; GCN-NEXT: s_mov_b64 s[30:31], [[SAVEPC]]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+define void @void_func_void_clobber_s30_s31() #2 {
+ call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_void_clobber_vcc:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_setpc_b64 s[30:31]
+define void @void_func_void_clobber_vcc() #2 {
+ call void asm sideeffect "", "~{VCC}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
+; GCN: s_getpc_b64
+; GCN-NEXT: s_add_u32
+; GCN-NEXT: s_addc_u32
+; GCN: s_mov_b64 s[34:35], vcc
+; GCN-NEXT: s_swappc_b64
+; GCN: s_mov_b64 vcc, s[34:35]
+define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)* %out) #0 {
+ %vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
+ call void @void_func_void_clobber_vcc()
+ %val0 = load volatile i32, i32 addrspace(1)* undef
+ %val1 = load volatile i32, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "{vcc}"(i64 %vcc)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
+; GCN: s_mov_b32 s33, s31
+; GCN-NEXT: s_swappc_b64
+; GCN-NEXT: s_mov_b32 s31, s33
+define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
+ %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
+; GCN: v_mov_b32_e32 v32, v31
+; GCN-NEXT: s_swappc_b64
+; GCN-NEXT: v_mov_b32_e32 v31, v32
+define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
+ %v31 = call i32 asm sideeffect "; def $0", "={v31}"()
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
+; GCN: s_mov_b32 s34, s9
+; GCN: ; def s33
+; GCN-NEXT: #ASMEND
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
+; GCN-NEXT: s_mov_b32 s4, s34
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use s33
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 {
+ %s33 = call i32 asm sideeffect "; def $0", "={s33}"()
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
+; GCN: s_mov_b32 s33, s9
+; GCN: ; def v32
+; GCN-NEXT: #ASMEND
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v32
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 {
+ %v32 = call i32 asm sideeffect "; def $0", "={v32}"()
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "{v32}"(i32 %v32)
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_void_clobber_s33:
+; GCN: v_writelane_b32 v0, s33, 0
+; GCN-NEXT: #ASMSTART
+; GCN-NEXT: ; clobber
+; GCN-NEXT: #ASMEND
+; GCN-NEXT: v_readlane_b32 s33, v0, 0
+; GCN-NEXT: s_setpc_b64
+define void @void_func_void_clobber_s33() #2 {
+ call void asm sideeffect "; clobber", "~{s33}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
+; GCN: s_mov_b32 s33, s7
+; GCN: s_getpc_b64
+; GCN-NEXT: s_add_u32
+; GCN-NEXT: s_addc_u32
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
+ call void @void_func_void_clobber_s33()
+ ret void
+}
+
+; GCN-LABEL: {{^}}callee_saved_sgpr_func:
+; GCN-NOT: s40
+; GCN: v_writelane_b32 v32, s40
+; GCN: s_swappc_b64
+; GCN-NOT: s40
+; GCN: ; use s40
+; GCN-NOT: s40
+; GCN: v_readlane_b32 s40, v32
+; GCN-NOT: s40
+define void @callee_saved_sgpr_func() #2 {
+ %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "s"(i32 %s40) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}callee_saved_sgpr_kernel:
+; GCN-NOT: s40
+; GCN: ; def s40
+; GCN-NOT: s40
+; GCN: s_swappc_b64
+; GCN-NOT: s40
+; GCN: ; use s40
+; GCN-NOT: s40
+define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
+ %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "s"(i32 %s40) #0
+ ret void
+}
+
+; First call preserved VGPR is used so it can't be used for SGPR spills.
+; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
+; GCN-NOT: s40
+; GCN: v_writelane_b32 v33, s40
+; GCN: s_swappc_b64
+; GCN-NOT: s40
+; GCN: ; use s40
+; GCN-NOT: s40
+; GCN: v_readlane_b32 s40, v33
+; GCN-NOT: s40
+define void @callee_saved_sgpr_vgpr_func() #2 {
+ %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
+ %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "s"(i32 %s40) #0
+ call void asm sideeffect "; use $0", "v"(i32 %v32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel:
+; GCN-NOT: s40
+; GCN: ; def s40
+; GCN-NOT: s40
+; GCN: s_swappc_b64
+; GCN-NOT: s40
+; GCN: ; use s40
+; GCN-NOT: s40
+define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 {
+ %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
+ %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
+ call void @external_void_func_void()
+ call void asm sideeffect "; use $0", "s"(i32 %s40) #0
+ call void asm sideeffect "; use $0", "v"(i32 %v32) #0
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind noinline }
Added: llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll?rev=309732&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll Tue Aug 1 12:54:18 2017
@@ -0,0 +1,241 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare void @external_void_func_void() #0
+
+declare i1 @external_i1_func_void() #0
+declare zeroext i1 @external_i1_zeroext_func_void() #0
+declare signext i1 @external_i1_signext_func_void() #0
+
+declare i8 @external_i8_func_void() #0
+declare zeroext i8 @external_i8_zeroext_func_void() #0
+declare signext i8 @external_i8_signext_func_void() #0
+
+declare i16 @external_i16_func_void() #0
+declare zeroext i16 @external_i16_zeroext_func_void() #0
+declare signext i16 @external_i16_signext_func_void() #0
+
+declare i32 @external_i32_func_void() #0
+declare i64 @external_i64_func_void() #0
+declare half @external_f16_func_void() #0
+declare float @external_f32_func_void() #0
+declare double @external_f64_func_void() #0
+
+declare <2 x i32> @external_v2i32_func_void() #0
+declare <3 x i32> @external_v3i32_func_void() #0
+declare <4 x i32> @external_v4i32_func_void() #0
+declare <5 x i32> @external_v5i32_func_void() #0
+declare <8 x i32> @external_v8i32_func_void() #0
+declare <16 x i32> @external_v16i32_func_void() #0
+declare <32 x i32> @external_v32i32_func_void() #0
+declare { <32 x i32>, i32 } @external_v32i32_i32_func_void() #0
+declare <2 x i16> @external_v2i16_func_void() #0
+declare <2 x half> @external_v2f16_func_void() #0
+
+declare { i32, i64 } @external_i32_i64_func_void() #0
+
+; GCN-LABEL: {{^}}test_call_external_void_func_void:
+define amdgpu_kernel void @test_call_external_void_func_void() #0 {
+ call void @external_void_func_void()
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_void_x2:
+define amdgpu_kernel void @test_call_external_void_func_void_x2() #0 {
+ call void @external_void_func_void()
+ call void @external_void_func_void()
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i1_func_void:
+define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
+ %val = call i1 @external_i1_func_void()
+ store volatile i1 %val, i1 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i1_zeroext_func_void:
+define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
+ %val = call i1 @external_i1_zeroext_func_void()
+ %val.ext = zext i1 %val to i32
+ store volatile i32 %val.ext, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i1_signext_func_void:
+define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
+ %val = call i1 @external_i1_signext_func_void()
+ %val.ext = zext i1 %val to i32
+ store volatile i32 %val.ext, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i8_func_void:
+define amdgpu_kernel void @test_call_external_i8_func_void() #0 {
+ %val = call i8 @external_i8_func_void()
+ store volatile i8 %val, i8 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i8_zeroext_func_void:
+define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 {
+ %val = call i8 @external_i8_zeroext_func_void()
+ %val.ext = zext i8 %val to i32
+ store volatile i32 %val.ext, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i8_signext_func_void:
+define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 {
+ %val = call i8 @external_i8_signext_func_void()
+ %val.ext = zext i8 %val to i32
+ store volatile i32 %val.ext, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i16_func_void:
+define amdgpu_kernel void @test_call_external_i16_func_void() #0 {
+ %val = call i16 @external_i16_func_void()
+ store volatile i16 %val, i16 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i16_zeroext_func_void:
+define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 {
+ %val = call i16 @external_i16_zeroext_func_void()
+ %val.ext = zext i16 %val to i32
+ store volatile i32 %val.ext, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i16_signext_func_void:
+define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 {
+ %val = call i16 @external_i16_signext_func_void()
+ %val.ext = zext i16 %val to i32
+ store volatile i32 %val.ext, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i32_func_void:
+define amdgpu_kernel void @test_call_external_i32_func_void() #0 {
+ %val = call i32 @external_i32_func_void()
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i64_func_void:
+define amdgpu_kernel void @test_call_external_i64_func_void() #0 {
+ %val = call i64 @external_i64_func_void()
+ store volatile i64 %val, i64 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_f16_func_void:
+define amdgpu_kernel void @test_call_external_f16_func_void() #0 {
+ %val = call half @external_f16_func_void()
+ store volatile half %val, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_f32_func_void:
+define amdgpu_kernel void @test_call_external_f32_func_void() #0 {
+ %val = call float @external_f32_func_void()
+ store volatile float %val, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_f64_func_void:
+define amdgpu_kernel void @test_call_external_f64_func_void() #0 {
+ %val = call double @external_f64_func_void()
+ store volatile double %val, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v2i32_func_void:
+define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
+ %val = call <2 x i32> @external_v2i32_func_void()
+ store volatile <2 x i32> %val, <2 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
+define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
+ %val = call <3 x i32> @external_v3i32_func_void()
+ store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v4i32_func_void:
+define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 {
+ %val = call <4 x i32> @external_v4i32_func_void()
+ store volatile <4 x i32> %val, <4 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v5i32_func_void:
+define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
+ %val = call <5 x i32> @external_v5i32_func_void()
+ store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v8i32_func_void:
+define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 {
+ %val = call <8 x i32> @external_v8i32_func_void()
+ store volatile <8 x i32> %val, <8 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v16i32_func_void:
+define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 {
+ %val = call <16 x i32> @external_v16i32_func_void()
+ store volatile <16 x i32> %val, <16 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v32i32_func_void:
+define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 {
+ %val = call <32 x i32> @external_v32i32_func_void()
+ store volatile <32 x i32> %val, <32 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v2i16_func_void:
+define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 {
+ %val = call <2 x i16> @external_v2i16_func_void()
+ store volatile <2 x i16> %val, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_v2f16_func_void:
+define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 {
+ %val = call <2 x half> @external_v2f16_func_void()
+ store volatile <2 x half> %val, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void:
+define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
+ %val = call { i32, i64 } @external_i32_i64_func_void()
+ %val.0 = extractvalue { i32, i64 } %val, 0
+ %val.1 = extractvalue { i32, i64 } %val, 1
+ store volatile i32 %val.0, i32 addrspace(1)* undef
+ store volatile i64 %val.1, i64 addrspace(1)* undef
+ ret void
+}
+
+; Requires writing results to stack
+; GCN-LABEL: {{^}}test_call_external_v32i32_i32_func_void:
+define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 {
+ %val = call { <32 x i32>, i32 } @external_v32i32_i32_func_void()
+ %val0 = extractvalue { <32 x i32>, i32 } %val, 0
+ %val1 = extractvalue { <32 x i32>, i32 } %val, 1
+ store volatile <32 x i32> %val0, <32 x i32> addrspace(1)* undef, align 8
+ store volatile i32 %val1, i32 addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind noinline }
Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll?rev=309732&r1=309731&r2=309732&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll Tue Aug 1 12:54:18 2017
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}callee_no_stack:
; GCN: ; BB#0:
@@ -8,6 +9,14 @@ define void @callee_no_stack() #0 {
ret void
}
+; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim:
+; GCN: ; BB#0:
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @callee_no_stack_no_fp_elim() #1 {
+ ret void
+}
+
; Requires frame pointer for access to local regular object.
; GCN-LABEL: {{^}}callee_with_stack:
@@ -24,4 +33,51 @@ define void @callee_with_stack() #0 {
ret void
}
+; GCN-LABEL: {{^}}callee_with_stack_and_call:
+; GCN: ; BB#0:
+; GCN-NEXT: s_waitcnt
+
+; GCN-DAG: s_mov_b32 s5, s32
+; GCN-DAG: v_writelane_b32 v32, s33,
+; GCN-DAG: v_writelane_b32 v32, s34,
+; GCN-DAG: v_writelane_b32 v32, s35,
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
+; GCN-DAG: s_mov_b32 s33, s5
+
+
+; GCN: s_swappc_b64
+; GCN: s_mov_b32 s5, s33
+; GCN-DAG: v_readlane_b32 s35,
+; GCN-DAG: v_readlane_b32 s34,
+; GCN-DAG: v_readlane_b32 s33,
+; GCN: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @callee_with_stack_and_call() #0 {
+ %alloca = alloca i32
+ store volatile i32 0, i32* %alloca
+ call void @external_void_func_void()
+ ret void
+}
+
+; Should be able to copy incoming stack pointer directly to inner
+; call's stack pointer argument.
+
+; GCN-LABEL: {{^}}callee_no_stack_with_call:
+; GCN: s_waitcnt
+; GCN-NOT: s32
+; GCN: s_mov_b32 s33, s5
+; GCN: s_swappc_b64
+; GCN: s_mov_b32 s5, s33
+; GCN-NOT: s32
+; GCN: s_setpc_b64
+define void @callee_no_stack_with_call() #0 {
+ call void @external_void_func_void()
+ ret void
+}
+
+declare void @external_void_func_void() #0
+
attributes #0 = { nounwind }
+attributes #1 = { nounwind "no-frame-pointer-elim"="true" }
Added: llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll?rev=309732&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll Tue Aug 1 12:54:18 2017
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VI %s
+
+; Test calls when called by other callable functions rather than
+; kernels.
+
+declare void @external_void_func_i32(i32) #0
+
+; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
+; GCN: s_waitcnt
+; GCN-NOT: s32
+; GCN: s_swappc_b64
+; GCN-NOT: s32
+; GCN: s_setpc_b64
+define void @test_func_call_external_void_func_i32_imm() #0 {
+ call void @external_void_func_i32(i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
+; GCN: s_waitcnt
+; GCN: s_mov_b32 s5, s32
+; GCN: s_add_u32 s32, s32, 0x1100{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
+; GCN: s_swappc_b64
+; GCN: s_sub_u32 s32, s32, 0x1100{{$}}
+; GCN: s_setpc_b64
+define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
+ %alloca = alloca [16 x i32], align 4
+ %gep0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 0
+ %gep15 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 16
+ store volatile i32 0, i32* %gep0
+ store volatile i32 0, i32* %gep15
+ call void @external_void_func_i32(i32 42)
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind noinline }