[llvm] d88f96d - ARM: support mandatory tail calls for tailcc & swifttailcc
Tim Northover via llvm-commits
llvm-commits at lists.llvm.org
Fri May 28 03:12:14 PDT 2021
Author: Tim Northover
Date: 2021-05-28T11:10:51+01:00
New Revision: d88f96dff3f192fc0c1bf57f7810b95a709b3591
URL: https://github.com/llvm/llvm-project/commit/d88f96dff3f192fc0c1bf57f7810b95a709b3591
DIFF: https://github.com/llvm/llvm-project/commit/d88f96dff3f192fc0c1bf57f7810b95a709b3591.diff
LOG: ARM: support mandatory tail calls for tailcc & swifttailcc
This adds support for callee-pop conventions to the ARM backend so that it can
ensure a call marked "tail" is actually a tail call.
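For illustration (this snippet is not part of the commit, it is adapted from the new
caller_to4_from4 tests), the conventions affected are fastcc under -tailcallopt, tailcc,
and swifttailcc. With callee-pop support the backend can turn IR like the following into
a plain branch, reusing the caller's own incoming stack-argument slot for the outgoing
argument:

declare tailcc void @callee_stack4([4 x i32], i32)

define tailcc void @caller([4 x i32], i32 %a) {
  ; callee_stack4 pops its own 4 bytes of stack arguments on return, so the
  ; caller may overwrite its incoming slot with 42 and branch straight to it.
  tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
  ret void
}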
Added:
llvm/test/CodeGen/ARM/fastcc-tailcall.ll
llvm/test/CodeGen/ARM/swifttailcc-call.ll
llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll
llvm/test/CodeGen/ARM/tailcc-call.ll
Modified:
llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
llvm/lib/Target/ARM/ARMCallingConv.td
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
llvm/lib/Target/ARM/ARMFastISel.cpp
llvm/lib/Target/ARM/ARMFrameLowering.cpp
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMISelLowering.h
llvm/lib/Target/ARM/ARMInstrInfo.td
llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
llvm/lib/Target/ARM/ARMSubtarget.cpp
llvm/test/CodeGen/ARM/dbg-tcreturn.ll
llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
llvm/test/CodeGen/ARM/v8m-tail-call.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 33d5aaf6b8c2..2f7f3eee9844 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -79,6 +79,11 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_NoRegs_SaveList;
} else if (F.getCallingConv() == CallingConv::CFGuard_Check) {
return CSR_Win_AAPCS_CFGuard_Check_SaveList;
+ } else if (F.getCallingConv() == CallingConv::SwiftTail) {
+ return STI.isTargetDarwin()
+ ? CSR_iOS_SwiftTail_SaveList
+ : (UseSplitPush ? CSR_AAPCS_SplitPush_SwiftTail_SaveList
+ : CSR_AAPCS_SwiftTail_SaveList);
} else if (F.hasFnAttribute("interrupt")) {
if (STI.isMClass()) {
// M-class CPUs have hardware which saves the registers needed to allow a
@@ -129,6 +134,10 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_NoRegs_RegMask;
if (CC == CallingConv::CFGuard_Check)
return CSR_Win_AAPCS_CFGuard_Check_RegMask;
+ if (CC == CallingConv::SwiftTail) {
+ return STI.isTargetDarwin() ? CSR_iOS_SwiftTail_RegMask
+ : CSR_AAPCS_SwiftTail_RegMask;
+ }
if (STI.getTargetLowering()->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return STI.isTargetDarwin() ? CSR_iOS_SwiftError_RegMask
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td
index 3517274e4c5c..a6dbe563a4ab 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -278,6 +278,9 @@ def CSR_Win_AAPCS_CFGuard_Check : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7,
// R8 is used to pass swifterror, remove it from CSR.
def CSR_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS, R8)>;
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS, R10)>;
+
// The order of callee-saved registers needs to match the order we actually push
// them in FrameLowering, because this order is what's used by
// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
@@ -290,6 +293,10 @@ def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
def CSR_AAPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
R8)>;
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
+ R10)>;
+
// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
// and the pointer return value are both passed in R0 in these cases, this can
// be partially modelled by treating R0 as a callee-saved register
@@ -305,6 +312,9 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
// R8 is used to pass swifterror, remove it from CSR.
def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_iOS_SwiftTail : CalleeSavedRegs<(sub CSR_iOS, R10)>;
+
def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
(sub CSR_AAPCS_ThisReturn, R9))>;
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index deea297c02ea..8d6bc063d6ef 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2037,7 +2037,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
auto NewMI = std::prev(MBBI);
- for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+ for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 73cb3a218827..32c54e8712cf 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -1849,6 +1849,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
}
case CallingConv::ARM_AAPCS_VFP:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
if (!isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
// Fall through to soft float variant, variadic functions don't
@@ -3014,6 +3015,7 @@ bool ARMFastISel::fastLowerArguments() {
case CallingConv::ARM_AAPCS:
case CallingConv::ARM_APCS:
case CallingConv::Swift:
+ case CallingConv::SwiftTail:
break;
}
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index fea771831b4d..c115cc1e413d 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -237,6 +237,41 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
}
+// Returns how much of the incoming argument stack area we should clean up in an
+// epilogue. For the C calling convention this will be 0, for guaranteed tail
+// call conventions it can be positive (a normal return or a tail call to a
+// function that uses less stack space for arguments) or negative (for a tail
+// call to a function that needs more stack space than us for arguments).
+static int getArgumentStackToRestore(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ bool IsTailCallReturn = false;
+ if (MBB.end() != MBBI) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ IsTailCallReturn = RetOpcode == ARM::TCRETURNdi ||
+ RetOpcode == ARM::TCRETURNri;
+ }
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ int ArgumentPopSize = 0;
+ if (IsTailCallReturn) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+ // the stack may actually be in use for the call's arguments, this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ return ArgumentPopSize;
+}
+
static void emitRegPlusImmediate(
bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
@@ -868,7 +903,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
"This emitEpilogue does not support Thumb1!");
bool isARM = !AFI->isThumbFunction();
- unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ // Amount of stack space we reserved next to incoming args for either
+ // varargs registers or stack arguments in tail calls made by this function.
+ unsigned ReservedArgStack = AFI->getArgRegsSaveSize();
+
+ // How much of the stack used by incoming arguments this function is expected
+ // to restore in this particular epilogue.
+ int IncomingArgStackToRestore = getArgumentStackToRestore(MF, MBB);
int NumBytes = (int)MFI.getStackSize();
Register FramePtr = RegInfo->getFrameRegister(MF);
@@ -882,8 +923,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
if (!AFI->hasStackFrame()) {
- if (NumBytes - ArgRegsSaveSize != 0)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize,
+ if (NumBytes - ReservedArgStack != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ReservedArgStack,
MachineInstr::FrameDestroy);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
@@ -897,7 +938,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
}
// Move SP to start of FP callee save spill area.
- NumBytes -= (ArgRegsSaveSize +
+ NumBytes -= (ReservedArgStack +
AFI->getFPCXTSaveAreaSize() +
AFI->getGPRCalleeSavedArea1Size() +
AFI->getGPRCalleeSavedArea2Size() +
@@ -969,9 +1010,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (AFI->getFPCXTSaveAreaSize()) MBBI++;
}
- if (ArgRegsSaveSize)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize,
+ if (ReservedArgStack || IncomingArgStackToRestore) {
+ assert(ReservedArgStack + IncomingArgStackToRestore >= 0 &&
+ "attempting to restore negative stack amount");
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII,
+ ReservedArgStack + IncomingArgStackToRestore,
MachineInstr::FrameDestroy);
+ }
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -2288,31 +2333,37 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
const ARMBaseInstrInfo &TII =
*static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ bool isARM = !AFI->isThumbFunction();
+ DebugLoc dl = I->getDebugLoc();
+ unsigned Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII.getCallFrameDestroyOpcode();
+ unsigned CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateCallFramePseudoInstr does not support Thumb1!");
+
+ int PIdx = I->findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred = (PIdx == -1)
+ ? ARMCC::AL
+ : (ARMCC::CondCodes)I->getOperand(PIdx).getImm();
+ unsigned PredReg = TII.getFramePred(*I);
+
if (!hasReservedCallFrame(MF)) {
+ // Bail early if the callee is expected to do the adjustment.
+ if (IsDestroy && CalleePopAmount != -1U)
+ return MBB.erase(I);
+
// If we have alloca, convert as follows:
// ADJCALLSTACKDOWN -> sub, sp, sp, amount
// ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr &Old = *I;
- DebugLoc dl = Old.getDebugLoc();
- unsigned Amount = TII.getFrameSize(Old);
+ unsigned Amount = TII.getFrameSize(*I);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
Amount = alignSPAdjust(Amount);
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- assert(!AFI->isThumb1OnlyFunction() &&
- "This eliminateCallFramePseudoInstr does not support Thumb1!");
- bool isARM = !AFI->isThumbFunction();
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old.getOpcode();
- int PIdx = Old.findFirstPredOperandIdx();
- ARMCC::CondCodes Pred =
- (PIdx == -1) ? ARMCC::AL
- : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
- unsigned PredReg = TII.getFramePred(Old);
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
Pred, PredReg);
@@ -2322,6 +2373,11 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
Pred, PredReg);
}
}
+ } else if (CalleePopAmount != -1U) {
+ // If the calling convention demands that the callee pops arguments from the
+ // stack, we want to add it back if we have a reserved call frame.
+ emitSPUpdate(isARM, MBB, I, dl, TII, -CalleePopAmount,
+ MachineInstr::NoFlags, Pred, PredReg);
}
return MBB.erase(I);
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3f4321b23260..4dee7438c955 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2008,6 +2008,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::SwiftTail:
return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
case CallingConv::C:
+ case CallingConv::Tail:
if (!Subtarget->isAAPCS_ABI())
return CallingConv::ARM_APCS;
else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
@@ -2184,19 +2185,31 @@ SDValue ARMTargetLowering::LowerCallResult(
return Chain;
}
-/// LowerMemOpCallTo - Store the argument to the stack.
-SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
- SDValue Arg, const SDLoc &dl,
- SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
- unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
- StackPtr, PtrOff);
- return DAG.getStore(
- Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
+ const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
+ bool IsTailCall, int SPDiff) const {
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ int32_t Offset = VA.getLocMemOffset();
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (IsTailCall) {
+ Offset += SPDiff;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ int Size = VA.getLocVT().getFixedSizeInBits() / 8;
+ int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
+ DstInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ } else {
+ SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+ DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
+ DstInfo =
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
+ }
+
+ return std::make_pair(DstAddr, DstInfo);
}
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
@@ -2205,7 +2218,8 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
- ISD::ArgFlagsTy Flags) const {
+ bool IsTailCall,
+ int SPDiff) const {
SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
unsigned id = Subtarget->isLittle() ? 0 : 1;
@@ -2219,12 +2233,20 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
getPointerTy(DAG.getDataLayout()));
- MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
- dl, DAG, NextVA,
- Flags));
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ std::tie(DstAddr, DstInfo) =
+ computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
}
}
+static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
+ return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
+ CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
+}
+
/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
@@ -2249,6 +2271,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
bool isCmseNSCall = false;
+ bool isSibCall = false;
bool PreferIndirect = false;
// Determine whether this is a non-secure function call.
@@ -2288,6 +2311,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
+
+ if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
+ isSibCall = true;
+
// We don't support GuaranteedTailCallOpt for ARM, only automatically
// detected sibcalls.
if (isTailCall)
@@ -2303,13 +2331,40 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- if (isTailCall) {
- // For tail calls, memory operands are available in our caller's stack.
+ // SPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int SPDiff = 0;
+
+ if (isTailCall && !isSibCall) {
+ auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
+ unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
+
+ // Since callee will pop argument stack as a tail call, we must keep the
+ // popped size 16-byte aligned.
+ Align StackAlign = DAG.getDataLayout().getStackAlignment();
+ NumBytes = alignTo(NumBytes, StackAlign);
+
+ // SPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // can actually shrink the stack.
+ SPDiff = NumReusableBytes - NumBytes;
+
+ // If this call requires more stack than we have available from
+ // LowerFormalArguments, tell FrameLowering to reserve space for it.
+ if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
+ AFI->setArgRegsSaveSize(-SPDiff);
+ }
+
+ if (isSibCall) {
+ // For sibling tail calls, memory operands are available in our caller's stack.
NumBytes = 0;
} else {
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
}
SDValue StackPtr =
@@ -2318,6 +2373,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPassVector RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
+ // During a tail call, stores to the argument area must happen after all of
+ // the function's incoming arguments have been loaded because they may alias.
+ // This is done by folding in a TokenFactor from LowerFormalArguments, but
+ // there's no point in doing so repeatedly so this tracks whether that's
+ // happened yet.
+ bool AfterFormalArgLoads = false;
+
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
@@ -2346,6 +2408,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
break;
}
+ if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+ AfterFormalArgLoads = true;
+ }
+
// f16 arguments have their size extended to 4 bytes and passed as if they
// had been copied to the LSBs of a 32-bit register.
// For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
@@ -2375,21 +2442,23 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(1, dl, MVT::i32));
PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
- StackPtr, MemOpChains, Flags);
+ StackPtr, MemOpChains, isTailCall, SPDiff);
VA = ArgLocs[++i]; // skip ahead to next loc
if (VA.isRegLoc()) {
PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
- StackPtr, MemOpChains, Flags);
+ StackPtr, MemOpChains, isTailCall, SPDiff);
} else {
assert(VA.isMemLoc());
-
- MemOpChains.push_back(
- LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ std::tie(DstAddr, DstInfo) =
+ computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
}
} else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
- StackPtr, MemOpChains, Flags);
+ StackPtr, MemOpChains, isTailCall, SPDiff);
} else if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i32) {
@@ -2439,9 +2508,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Flags.getByValSize() > 4*offset) {
auto PtrVT = getPointerTy(DAG.getDataLayout());
- unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
+ SDValue Dst;
+ MachinePointerInfo DstInfo;
+ std::tie(Dst, DstInfo) =
+ computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
@@ -2454,11 +2524,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
Ops));
}
- } else if (!isTailCall) {
+ } else {
assert(VA.isMemLoc());
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+ std::tie(DstAddr, DstInfo) =
+ computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
- MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
- dl, DAG, VA, Flags));
+ SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
}
}
@@ -2622,10 +2696,24 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
}
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (isTailCall && !isSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ InFlag = Chain.getValue(1);
+ }
+
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
+ if (isTailCall) {
+ Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
+ }
+
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2670,8 +2758,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+ // If we're guaranteeing tail-calls will be honoured, the callee must
+ // pop its own argument stack on return. But this call is *not* a tail call so
+ // we need to undo that after it returns to restore the status-quo.
+ bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+ uint64_t CalleePopBytes =
+ canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
+
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ DAG.getIntPtrConstant(CalleePopBytes, dl, true),
+ InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -2812,6 +2908,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
if (CallerF.hasFnAttribute("interrupt"))
return false;
+ if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+ return CalleeCC == CallerCC;
+
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
@@ -4460,7 +4559,17 @@ SDValue ARMTargetLowering::LowerFormalArguments(
}
}
- AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+ unsigned StackArgSize = CCInfo.getNextStackOffset();
+ bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+ if (canGuaranteeTCO(CallConv, TailCallOpt)) {
+ // The only way to guarantee a tail call is if the callee restores its
+ // argument area, but it must also keep the stack aligned when doing so.
+ const DataLayout &DL = DAG.getDataLayout();
+ StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
+
+ AFI->setArgumentStackToRestore(StackArgSize);
+ }
+ AFI->setArgumentStackSize(StackArgSize);
if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
DiagnosticInfoUnsupported Diag(
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 80e4e12c702e..5b4a96d93cab 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -756,7 +756,8 @@ class VectorType;
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
- ISD::ArgFlagsTy Flags) const;
+ bool IsTailCall,
+ int SPDiff) const;
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &dl) const;
@@ -765,10 +766,10 @@ class VectorType;
bool isVarArg) const;
CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
bool isVarArg) const;
- SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- const SDLoc &dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const;
+ std::pair<SDValue, MachinePointerInfo>
+ computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, SDValue StackPtr,
+ bool IsTailCall, int SPDiff) const;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 6b61c598f572..65c7fb5ca118 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -66,7 +66,7 @@ def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
SDTCisInt<1>]>;
-def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARMTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
@@ -2629,10 +2629,10 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
// Tail calls.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+ def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, i32imm:$SPDiff), IIC_Br, []>,
Sched<[WriteBr]>;
- def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+ def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, i32imm:$SPDiff), IIC_Br, []>,
Sched<[WriteBr]>;
def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
@@ -6003,9 +6003,12 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst),
// TODO: add,sub,and, 3-instr forms?
// Tail calls. These patterns also apply to Thumb mode.
-def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
-def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARMtcret tcGPR:$dst, (i32 timm:$SPDiff)),
+ (TCRETURNri tcGPR:$dst, timm:$SPDiff)>;
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst), (i32 timm:$SPDiff)),
+ (TCRETURNdi texternalsym:$dst, (i32 timm:$SPDiff))>;
+def : Pat<(ARMtcret (i32 texternalsym:$dst), (i32 timm:$SPDiff)),
+ (TCRETURNdi texternalsym:$dst, i32imm:$SPDiff)>;
// Direct calls
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index 298c8a238987..851655284060 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -43,7 +43,9 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// "attach" GPR-part to the part that was passed via stack.
unsigned StByValParamsPadding = 0;
- /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+ /// ArgRegsSaveSize - Size of the register save area for vararg functions or
+ /// those making guaranteed tail calls that need more stack argument space
+ /// than is provided by this function's incoming parameters.
///
unsigned ArgRegsSaveSize = 0;
@@ -118,6 +120,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// being passed on the stack
unsigned ArgumentStackSize = 0;
+ /// ArgumentStackToRestore - amount of bytes on stack consumed that we must
+ /// restore on return.
+ unsigned ArgumentStackToRestore = 0;
+
/// CoalescedWeights - mapping of basic blocks to the rolling counter of
/// coalesced weights.
DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
@@ -195,6 +201,9 @@ class ARMFunctionInfo : public MachineFunctionInfo {
unsigned getArgumentStackSize() const { return ArgumentStackSize; }
void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+ unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+ void setArgumentStackToRestore(unsigned v) { ArgumentStackToRestore = v; }
+
void initPICLabelUId(unsigned UId) {
PICLabelUId = UId;
}
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 5cb608b74ace..90f1b693fec6 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -230,7 +230,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// registers are the 4 used for parameters. We don't currently do this
// case.
- SupportsTailCall = !isThumb() || hasV8MBaselineOps();
+ SupportsTailCall = !isThumb1Only() || hasV8MBaselineOps();
if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
SupportsTailCall = false;
diff --git a/llvm/test/CodeGen/ARM/dbg-tcreturn.ll b/llvm/test/CodeGen/ARM/dbg-tcreturn.ll
index d4061be98180..037fda116f38 100644
--- a/llvm/test/CodeGen/ARM/dbg-tcreturn.ll
+++ b/llvm/test/CodeGen/ARM/dbg-tcreturn.ll
@@ -12,7 +12,7 @@ target triple = "thumbv7-apple-ios7.0.0"
; CHECK-NEXT: $r0 = COPY %0
; CHECK-NEXT: $r1 = COPY %1
; CHECK-NEXT: DBG_VALUE $noreg, $noreg, !13, !DIExpression(), debug-location !16
-; CHECK-NEXT: TCRETURNdi &__divsi3, implicit $sp, implicit $r0, implicit $r1
+; CHECK-NEXT: TCRETURNdi &__divsi3, 0, implicit $sp, implicit $r0, implicit $r1
define i32 @test(i32 %a1, i32 %a2) !dbg !5 {
entry:
diff --git a/llvm/test/CodeGen/ARM/fastcc-tailcall.ll b/llvm/test/CodeGen/ARM/fastcc-tailcall.ll
new file mode 100644
index 000000000000..fc0717278b7e
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/fastcc-tailcall.ll
@@ -0,0 +1,193 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos -tailcallopt | FileCheck %s
+
+declare fastcc void @callee_stack0()
+declare fastcc void @callee_stack4([4 x i32], i32)
+declare fastcc void @callee_stack20([4 x i32], [5 x i32])
+declare extern_weak fastcc void @callee_weak()
+
+define fastcc void @caller_to0_from0() nounwind {
+; CHECK-LABEL: _caller_to0_from0:
+
+ tail call fastcc void @callee_stack0()
+ ret void
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: b.w _callee_stack0
+}
+
+define fastcc void @caller_to0_from4([4 x i32], i32) {
+; CHECK-LABEL: _caller_to0_from4:
+
+ tail call fastcc void @callee_stack0()
+ ret void
+
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack0
+}
+
+define fastcc void @caller_to4_from0() {
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+ tail call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK-LABEL: _caller_to4_from0:
+; CHECK: sub sp, #16
+; CHECK: movs [[TMP:r[0-9]+]], #42
+; CHECK: str [[TMP]], [sp]
+; CHECK-NOT: add sp
+; CHECK: b.w _callee_stack4
+
+}
+
+define fastcc void @caller_to4_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to4_from4:
+; CHECK-NOT: sub sp
+; Key point is that the "%a" should go right at SP on entry.
+ tail call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK: str {{r[0-9]+}}, [sp]
+; CHECK-NOT: add sp
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+define fastcc void @caller_to20_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to20_from4:
+; CHECK: sub sp, #16
+
+; Important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; _callee will not deallocate the space, even in fastcc.
+ tail call fastcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5])
+
+; CHECK: str {{.*}}, [sp]
+; CHECK: str {{.*}}, [sp, #4]
+; CHECK: str {{.*}}, [sp, #8]
+; CHECK: str {{.*}}, [sp, #12]
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK-NOT: add sp
+; CHECK-NOT: sub sp
+; CHECK: b.w _callee_stack20
+ ret void
+}
+
+
+define fastcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: _caller_to4_from24:
+
+
+; Key point is that the "%a" should go at #16 above SP on entry.
+ tail call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+
+define fastcc void @caller_to20_from20([4 x i32], [5 x i32] %a) {
+; CHECK-LABEL: _caller_to20_from20:
+; CHECK-NOT: add sp,
+; CHECK-NOT: sub sp,
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise either %a or %b.w will be wrongly clobbered.
+ tail call fastcc void @callee_stack20([4 x i32] undef, [5 x i32] %a)
+ ret void
+
+ ; If these ever get interleaved make sure aliasing slots don't clobber each
+ ; other.
+; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12]
+; CHECK: ldm.w sp,
+; CHECK: stm.w
+; CHECK: strd
+; CHECK-NEXT: b.w _callee_stack20
+}
+
+define fastcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+
+ tail call fastcc void @callee_stack0()
+ ret void
+
+; CHECK: bl _callee_stack0
+; CHECK: ret
+}
+
+define fastcc void @normal_ret_with_stack([4 x i32], i32 %a) {
+; CHECK: _normal_ret_with_stack:
+; CHECK: add sp, #16
+; CHECK: bx lr
+ ret void
+}
+
+declare { [2 x float] } @get_vec2()
+
+define void @fromC_totail() {
+; COMMON-LABEL: fromC_totail:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; We must reset the stack to where it was before the call by undoing its extra stack pop.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl callee_stack4
+; COMMON: sub sp, #16
+
+ call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+define void @fromC_totail_noreservedframe(i32 %len) {
+; COMMON-LABEL: fromC_totail_noreservedframe:
+; COMMON: sub.w sp, sp, r{{.*}}
+
+; COMMON: movs [[TMP:r[0-9]+]], #42
+ ; Note stack is subtracted here to allocate space for arg
+; COMMON: sub.w sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; And here.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; But not restored here because callee_stack4 did that for us.
+; COMMON-NOT: sub sp,
+
+ ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs.
+ %var = alloca i32, i32 %len
+
+ call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+declare void @Ccallee_stack4([4 x i32], i32)
+
+define fastcc void @fromtail_toC() {
+; COMMON-LABEL: fromtail_toC:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _Ccallee_stack4
+ ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything.
+; COMMON-NOT: add sp,
+; COMMON-NOT: sub sp,
+; COMMON: str [[TMP]], [sp]{{$}}
+; COMMON: bl _Ccallee_stack4
+; COMMON-NOT: sub sp,
+
+ call void @Ccallee_stack4([4 x i32] undef, i32 42)
+ call void @Ccallee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir b/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
index 104c887b5f7d..8ee4a80067fa 100644
--- a/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
+++ b/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
@@ -41,5 +41,5 @@ body: |
$r1 = COPY %1
$r2 = COPY %2
$r3 = COPY %3
- TCRETURNri killed %5, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
+ TCRETURNri killed %5, 0, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
...
diff --git a/llvm/test/CodeGen/ARM/swifttailcc-call.ll b/llvm/test/CodeGen/ARM/swifttailcc-call.ll
new file mode 100644
index 000000000000..2514e26900ee
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/swifttailcc-call.ll
@@ -0,0 +1,201 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos | FileCheck %s
+
+declare swifttailcc void @callee_stack0()
+declare swifttailcc void @callee_stack4([4 x i32], i32)
+declare swifttailcc void @callee_stack20([4 x i32], [5 x i32])
+declare extern_weak swifttailcc void @callee_weak()
+
+define swifttailcc void @caller_to0_from0() nounwind {
+; CHECK-LABEL: _caller_to0_from0:
+
+ tail call swifttailcc void @callee_stack0()
+ ret void
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: b.w _callee_stack0
+}
+
+define swifttailcc void @caller_to0_from4([4 x i32], i32) {
+; CHECK-LABEL: _caller_to0_from4:
+
+ tail call swifttailcc void @callee_stack0()
+ ret void
+
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack0
+}
+
+define swifttailcc void @caller_to4_from0() {
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+ tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK-LABEL: _caller_to4_from0:
+; CHECK: sub sp, #16
+; CHECK: movs [[TMP:r[0-9]+]], #42
+; CHECK: str [[TMP]], [sp]
+; CHECK-NOT: add sp
+; CHECK: b.w _callee_stack4
+
+}
+
+define swifttailcc void @caller_to4_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to4_from4:
+; CHECK-NOT: sub sp
+; Key point is that the "%a" should go right at SP on entry.
+ tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK: str {{r[0-9]+}}, [sp]
+; CHECK-NOT: add sp
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+define swifttailcc void @caller_to20_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to20_from4:
+; CHECK: sub sp, #16
+
+; Important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; _callee will not deallocate the space, even in swifttailcc.
+ tail call swifttailcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5])
+
+; CHECK: str {{.*}}, [sp]
+; CHECK: str {{.*}}, [sp, #4]
+; CHECK: str {{.*}}, [sp, #8]
+; CHECK: str {{.*}}, [sp, #12]
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK-NOT: add sp
+; CHECK-NOT: sub sp
+; CHECK: b.w _callee_stack20
+ ret void
+}
+
+
+define swifttailcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: _caller_to4_from24:
+
+
+; Key point is that the "%a" should go at #16 above SP on entry.
+ tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+
+define swifttailcc void @caller_to20_from20([4 x i32], [5 x i32] %a) {
+; CHECK-LABEL: _caller_to20_from20:
+; CHECK-NOT: add sp,
+; CHECK-NOT: sub sp,
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise either %a or %b.w will be wrongly clobbered.
+ tail call swifttailcc void @callee_stack20([4 x i32] undef, [5 x i32] %a)
+ ret void
+
+ ; If these ever get interleaved make sure aliasing slots don't clobber each
+ ; other.
+; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12]
+; CHECK: ldm.w sp,
+; CHECK: stm.w
+; CHECK: strd
+; CHECK-NEXT: b.w _callee_stack20
+}
+
+define swifttailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+
+ tail call swifttailcc void @callee_stack0()
+ ret void
+
+; CHECK: bl _callee_stack0
+; CHECK: ret
+}
+
+define swifttailcc void @normal_ret_with_stack([4 x i32], i32 %a) {
+; CHECK: _normal_ret_with_stack:
+; CHECK: add sp, #16
+; CHECK: bx lr
+ ret void
+}
+
+declare { [2 x float] } @get_vec2()
+
+define void @fromC_totail() {
+; COMMON-LABEL: fromC_totail:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; We must reset the stack to where it was before the call by undoing its extra stack pop.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl callee_stack4
+; COMMON: sub sp, #16
+
+ call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+define void @fromC_totail_noreservedframe(i32 %len) {
+; COMMON-LABEL: fromC_totail_noreservedframe:
+; COMMON: sub.w sp, sp, r{{.*}}
+
+; COMMON: movs [[TMP:r[0-9]+]], #42
+ ; Note stack is subtracted here to allocate space for arg
+; COMMON: sub.w sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; And here.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; But not restored here because callee_stack4 did that for us.
+; COMMON-NOT: sub sp,
+
+ ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs.
+ %var = alloca i32, i32 %len
+
+ call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+declare void @Ccallee_stack4([4 x i32], i32)
+
+define swifttailcc void @fromtail_toC() {
+; COMMON-LABEL: fromtail_toC:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _Ccallee_stack4
+ ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything.
+; COMMON-NOT: add sp,
+; COMMON-NOT: sub sp,
+; COMMON: str [[TMP]], [sp]{{$}}
+; COMMON: bl _Ccallee_stack4
+; COMMON-NOT: sub sp,
+
+ call void @Ccallee_stack4([4 x i32] undef, i32 42)
+ call void @Ccallee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure)
+define swiftcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) {
+; CHECK-LABEL: CallSwiftSelf:
+; CHECK: push{{.*}}r10
+ %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure)
+ ret i8* %res
+}
diff --git a/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll b/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll
new file mode 100644
index 000000000000..7d6af2d801aa
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -O0 -fast-isel %s -o - | FileCheck %s
+
+declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure)
+
+define swifttailcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) {
+; CHECK-LABEL: CallSwiftSelf:
+; CHECK: bl _SwiftSelf
+; CHECK: pop {r7, pc}
+ %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself null)
+ ret i8* %res
+}
diff --git a/llvm/test/CodeGen/ARM/tailcc-call.ll b/llvm/test/CodeGen/ARM/tailcc-call.ll
new file mode 100644
index 000000000000..ced6f02978dd
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/tailcc-call.ll
@@ -0,0 +1,193 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos | FileCheck %s
+
+declare tailcc void @callee_stack0()
+declare tailcc void @callee_stack4([4 x i32], i32)
+declare tailcc void @callee_stack20([4 x i32], [5 x i32])
+declare extern_weak tailcc void @callee_weak()
+
+define tailcc void @caller_to0_from0() nounwind {
+; CHECK-LABEL: _caller_to0_from0:
+
+ tail call tailcc void @callee_stack0()
+ ret void
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: b.w _callee_stack0
+}
+
+define tailcc void @caller_to0_from4([4 x i32], i32) {
+; CHECK-LABEL: _caller_to0_from4:
+
+ tail call tailcc void @callee_stack0()
+ ret void
+
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack0
+}
+
+define tailcc void @caller_to4_from0() {
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+ tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK-LABEL: _caller_to4_from0:
+; CHECK: sub sp, #16
+; CHECK: movs [[TMP:r[0-9]+]], #42
+; CHECK: str [[TMP]], [sp]
+; CHECK-NOT: add sp
+; CHECK: b.w _callee_stack4
+
+}
+
+define tailcc void @caller_to4_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to4_from4:
+; CHECK-NOT: sub sp
+; Key point is that the "%a" should go right at SP on entry.
+ tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK: str {{r[0-9]+}}, [sp]
+; CHECK-NOT: add sp
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+define tailcc void @caller_to20_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to20_from4:
+; CHECK: sub sp, #16
+
+; Important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; _callee will not deallocate the space, even in tailcc.
+ tail call tailcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5])
+
+; CHECK: str {{.*}}, [sp]
+; CHECK: str {{.*}}, [sp, #4]
+; CHECK: str {{.*}}, [sp, #8]
+; CHECK: str {{.*}}, [sp, #12]
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK-NOT: add sp
+; CHECK-NOT: sub sp
+; CHECK: b.w _callee_stack20
+ ret void
+}
+
+
+define tailcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: _caller_to4_from24:
+
+
+; Key point is that the "%a" should go at #16 above SP on entry.
+ tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+
+define tailcc void @caller_to20_from20([4 x i32], [5 x i32] %a) {
+; CHECK-LABEL: _caller_to20_from20:
+; CHECK-NOT: add sp,
+; CHECK-NOT: sub sp,
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise either %a or %b.w will be wrongly clobbered.
+ tail call tailcc void @callee_stack20([4 x i32] undef, [5 x i32] %a)
+ ret void
+
+ ; If these ever get interleaved make sure aliasing slots don't clobber each
+ ; other.
+; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12]
+; CHECK: ldm.w sp,
+; CHECK: stm.w
+; CHECK: strd
+; CHECK-NEXT: b.w _callee_stack20
+}
+
+define tailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+
+ tail call tailcc void @callee_stack0()
+ ret void
+
+; CHECK: bl _callee_stack0
+; CHECK: ret
+}
+
+define tailcc void @normal_ret_with_stack([4 x i32], i32 %a) {
+; CHECK: _normal_ret_with_stack:
+; CHECK: add sp, #16
+; CHECK: bx lr
+ ret void
+}
+
+declare { [2 x float] } @get_vec2()
+
+define void @fromC_totail() {
+; COMMON-LABEL: fromC_totail:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; We must reset the stack to where it was before the call by undoing its extra stack pop.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl callee_stack4
+; COMMON: sub sp, #16
+
+ call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+define void @fromC_totail_noreservedframe(i32 %len) {
+; COMMON-LABEL: fromC_totail_noreservedframe:
+; COMMON: sub.w sp, sp, r{{.*}}
+
+; COMMON: movs [[TMP:r[0-9]+]], #42
+ ; Note stack is subtracted here to allocate space for arg
+; COMMON: sub.w sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; And here.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+ ; But not restored here because callee_stack4 did that for us.
+; COMMON-NOT: sub sp,
+
+ ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs.
+ %var = alloca i32, i32 %len
+
+ call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
+
+declare void @Ccallee_stack4([4 x i32], i32)
+
+define tailcc void @fromtail_toC() {
+; COMMON-LABEL: fromtail_toC:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _Ccallee_stack4
+ ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything.
+; COMMON-NOT: add sp,
+; COMMON-NOT: sub sp,
+; COMMON: str [[TMP]], [sp]{{$}}
+; COMMON: bl _Ccallee_stack4
+; COMMON-NOT: sub sp,
+
+ call void @Ccallee_stack4([4 x i32] undef, i32 42)
+ call void @Ccallee_stack4([4 x i32] undef, i32 42)
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/v8m-tail-call.ll b/llvm/test/CodeGen/ARM/v8m-tail-call.ll
index 7ee80d4b9b96..c683230c3460 100644
--- a/llvm/test/CodeGen/ARM/v8m-tail-call.ll
+++ b/llvm/test/CodeGen/ARM/v8m-tail-call.ll
@@ -41,25 +41,30 @@ declare i32 @h2(i32, i32, i32, i32, i32)
define hidden i32 @f2(i32, i32, i32, i32, i32) {
; CHECK-LABEL: f2:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: mov r4, r3
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: ldr r7, [sp, #24]
; CHECK-NEXT: bl g
; CHECK-NEXT: cbz r0, .LBB2_2
; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: str r7, [sp, #24]
; CHECK-NEXT: mov r1, r6
; CHECK-NEXT: mov r2, r5
; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: ldr r4, [sp, #12]
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: ldr r4, [sp, #16]
; CHECK-NEXT: mov lr, r4
-; CHECK-NEXT: pop {r4, r5, r6}
+; CHECK-NEXT: pop {r4, r5, r6, r7}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: b h2
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mvns r0, r0
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
%6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
%7 = icmp eq i32 %6, 0
br i1 %7, label %10, label %8