[llvm] d88f96d - ARM: support mandatory tail calls for tailcc & swifttailcc

Tim Northover via llvm-commits llvm-commits at lists.llvm.org
Fri May 28 03:12:14 PDT 2021


Author: Tim Northover
Date: 2021-05-28T11:10:51+01:00
New Revision: d88f96dff3f192fc0c1bf57f7810b95a709b3591

URL: https://github.com/llvm/llvm-project/commit/d88f96dff3f192fc0c1bf57f7810b95a709b3591
DIFF: https://github.com/llvm/llvm-project/commit/d88f96dff3f192fc0c1bf57f7810b95a709b3591.diff

LOG: ARM: support mandatory tail calls for tailcc & swifttailcc

This adds support for callee-pop conventions to the ARM backend so that it can
ensure a call marked "tail" is actually a tail call.
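
For illustration only (this snippet is not part of the patch and the function
names are made up): under a callee-pop convention such as tailcc or
swifttailcc, a call marked "tail" or "musttail" like the one below must now be
lowered as a genuine tail call on ARM, with the callee popping its own
stack-argument area on return.

    declare tailcc void @callee([4 x i32], i32)

    define tailcc void @caller([4 x i32] %regs, i32 %x) {
      musttail call tailcc void @callee([4 x i32] %regs, i32 %x)
      ret void
    }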

Added: 
    llvm/test/CodeGen/ARM/fastcc-tailcall.ll
    llvm/test/CodeGen/ARM/swifttailcc-call.ll
    llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll
    llvm/test/CodeGen/ARM/tailcc-call.ll

Modified: 
    llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
    llvm/lib/Target/ARM/ARMCallingConv.td
    llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
    llvm/lib/Target/ARM/ARMFastISel.cpp
    llvm/lib/Target/ARM/ARMFrameLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/lib/Target/ARM/ARMInstrInfo.td
    llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
    llvm/lib/Target/ARM/ARMSubtarget.cpp
    llvm/test/CodeGen/ARM/dbg-tcreturn.ll
    llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
    llvm/test/CodeGen/ARM/v8m-tail-call.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 33d5aaf6b8c2..2f7f3eee9844 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -79,6 +79,11 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_NoRegs_SaveList;
   } else if (F.getCallingConv() == CallingConv::CFGuard_Check) {
     return CSR_Win_AAPCS_CFGuard_Check_SaveList;
+  } else if (F.getCallingConv() == CallingConv::SwiftTail) {
+    return STI.isTargetDarwin()
+               ? CSR_iOS_SwiftTail_SaveList
+               : (UseSplitPush ? CSR_AAPCS_SplitPush_SwiftTail_SaveList
+                               : CSR_AAPCS_SwiftTail_SaveList);
   } else if (F.hasFnAttribute("interrupt")) {
     if (STI.isMClass()) {
       // M-class CPUs have hardware which saves the registers needed to allow a
@@ -129,6 +134,10 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
     return CSR_NoRegs_RegMask;
   if (CC == CallingConv::CFGuard_Check)
     return CSR_Win_AAPCS_CFGuard_Check_RegMask;
+  if (CC == CallingConv::SwiftTail) {
+    return STI.isTargetDarwin() ? CSR_iOS_SwiftTail_RegMask
+                                : CSR_AAPCS_SwiftTail_RegMask;
+  }
   if (STI.getTargetLowering()->supportSwiftError() &&
       MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
     return STI.isTargetDarwin() ? CSR_iOS_SwiftError_RegMask

diff  --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td
index 3517274e4c5c..a6dbe563a4ab 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -278,6 +278,9 @@ def CSR_Win_AAPCS_CFGuard_Check : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7,
 // R8 is used to pass swifterror, remove it from CSR.
 def CSR_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS, R8)>;
 
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS, R10)>;
+
 // The order of callee-saved registers needs to match the order we actually push
 // them in FrameLowering, because this order is what's used by
 // PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
@@ -290,6 +293,10 @@ def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
 def CSR_AAPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
                                                       R8)>;
 
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_AAPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_AAPCS_SplitPush,
+                                                     R10)>;
+
 // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
 // and the pointer return value are both passed in R0 in these cases, this can
 // be partially modelled by treating R0 as a callee-saved register
@@ -305,6 +312,9 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 // R8 is used to pass swifterror, remove it from CSR.
 def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
 
+// R10 is used to pass swiftself, remove it from CSR.
+def CSR_iOS_SwiftTail : CalleeSavedRegs<(sub CSR_iOS, R10)>;
+
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
                                          (sub CSR_AAPCS_ThisReturn, R9))>;
 

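As the comments above note, R10 carries the swiftself argument, so the new
SwiftTail CSR variants drop it from the callee-saved set and a swifttailcc
function is not required to preserve it. A hypothetical IR sketch of the kind
of call this affects (names are illustrative, not from the patch):

    declare swifttailcc i8* @take_self(i8* swiftself)

    define swifttailcc i8* @forward_self(i8* swiftself %self) {
      %res = tail call swifttailcc i8* @take_self(i8* swiftself %self)
      ret i8* %res
    }
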
diff  --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index deea297c02ea..8d6bc063d6ef 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2037,7 +2037,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       }
 
       auto NewMI = std::prev(MBBI);
-      for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+      for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
         NewMI->addOperand(MBBI->getOperand(i));
 
 

diff  --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 73cb3a218827..32c54e8712cf 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -1849,6 +1849,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
     }
   case CallingConv::ARM_AAPCS_VFP:
   case CallingConv::Swift:
+  case CallingConv::SwiftTail:
     if (!isVarArg)
       return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
     // Fall through to soft float variant, variadic functions don't
@@ -3014,6 +3015,7 @@ bool ARMFastISel::fastLowerArguments() {
   case CallingConv::ARM_AAPCS:
   case CallingConv::ARM_APCS:
   case CallingConv::Swift:
+  case CallingConv::SwiftTail:
     break;
   }
 

diff  --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index fea771831b4d..c115cc1e413d 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -237,6 +237,41 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
   return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
 }
 
+// Returns how much of the incoming argument stack area we should clean up in an
+// epilogue. For the C calling convention this will be 0, for guaranteed tail
+// call conventions it can be positive (a normal return or a tail call to a
+// function that uses less stack space for arguments) or negative (for a tail
+// call to a function that needs more stack space than us for arguments).
+static int getArgumentStackToRestore(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  bool IsTailCallReturn = false;
+  if (MBB.end() != MBBI) {
+    unsigned RetOpcode = MBBI->getOpcode();
+    IsTailCallReturn = RetOpcode == ARM::TCRETURNdi ||
+                       RetOpcode == ARM::TCRETURNri;
+  }
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  int ArgumentPopSize = 0;
+  if (IsTailCallReturn) {
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+    // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments, this is
+    // calculated during LowerCall and consumed here...
+    ArgumentPopSize = StackAdjust.getImm();
+  } else {
+    // ... otherwise the amount to pop is *all* of the argument space,
+    // conveniently stored in the MachineFunctionInfo by
+    // LowerFormalArguments. This will, of course, be zero for the C calling
+    // convention.
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
+  }
+
+  return ArgumentPopSize;
+}
+
 static void emitRegPlusImmediate(
     bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
     const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
@@ -868,7 +903,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
          "This emitEpilogue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
 
-  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  // Amount of stack space we reserved next to incoming args for either
+  // varargs registers or stack arguments in tail calls made by this function.
+  unsigned ReservedArgStack = AFI->getArgRegsSaveSize();
+
+  // How much of the stack used by incoming arguments this function is expected
+  // to restore in this particular epilogue.
+  int IncomingArgStackToRestore = getArgumentStackToRestore(MF, MBB);
   int NumBytes = (int)MFI.getStackSize();
   Register FramePtr = RegInfo->getFrameRegister(MF);
 
@@ -882,8 +923,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
   if (!AFI->hasStackFrame()) {
-    if (NumBytes - ArgRegsSaveSize != 0)
-      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize,
+    if (NumBytes - ReservedArgStack != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ReservedArgStack,
                    MachineInstr::FrameDestroy);
   } else {
     // Unwind MBBI to point to first LDR / VLDRD.
@@ -897,7 +938,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
     }
 
     // Move SP to start of FP callee save spill area.
-    NumBytes -= (ArgRegsSaveSize +
+    NumBytes -= (ReservedArgStack +
                  AFI->getFPCXTSaveAreaSize() +
                  AFI->getGPRCalleeSavedArea1Size() +
                  AFI->getGPRCalleeSavedArea2Size() +
@@ -969,9 +1010,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
     if (AFI->getFPCXTSaveAreaSize()) MBBI++;
   }
 
-  if (ArgRegsSaveSize)
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize,
+  if (ReservedArgStack || IncomingArgStackToRestore) {
+    assert(ReservedArgStack + IncomingArgStackToRestore >= 0 &&
+           "attempting to restore negative stack amount");
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII,
+                 ReservedArgStack + IncomingArgStackToRestore,
                  MachineInstr::FrameDestroy);
+  }
 }
 
 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -2288,31 +2333,37 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
     MachineBasicBlock::iterator I) const {
   const ARMBaseInstrInfo &TII =
       *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  DebugLoc dl = I->getDebugLoc();
+  unsigned Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII.getCallFrameDestroyOpcode();
+  unsigned CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateCallFramePseudoInstr does not support Thumb1!");
+
+  int PIdx = I->findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+                              ? ARMCC::AL
+                              : (ARMCC::CondCodes)I->getOperand(PIdx).getImm();
+  unsigned PredReg = TII.getFramePred(*I);
+
   if (!hasReservedCallFrame(MF)) {
+    // Bail early if the callee is expected to do the adjustment.
+    if (IsDestroy && CalleePopAmount != -1U)
+      return MBB.erase(I);
+
     // If we have alloca, convert as follows:
     // ADJCALLSTACKDOWN -> sub, sp, sp, amount
     // ADJCALLSTACKUP   -> add, sp, sp, amount
-    MachineInstr &Old = *I;
-    DebugLoc dl = Old.getDebugLoc();
-    unsigned Amount = TII.getFrameSize(Old);
+    unsigned Amount = TII.getFrameSize(*I);
     if (Amount != 0) {
       // We need to keep the stack aligned properly.  To do this, we round the
       // amount of space needed for the outgoing arguments up to the next
       // alignment boundary.
       Amount = alignSPAdjust(Amount);
 
-      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-      assert(!AFI->isThumb1OnlyFunction() &&
-             "This eliminateCallFramePseudoInstr does not support Thumb1!");
-      bool isARM = !AFI->isThumbFunction();
-
-      // Replace the pseudo instruction with a new instruction...
-      unsigned Opc = Old.getOpcode();
-      int PIdx = Old.findFirstPredOperandIdx();
-      ARMCC::CondCodes Pred =
-          (PIdx == -1) ? ARMCC::AL
-                       : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
-      unsigned PredReg = TII.getFramePred(Old);
       if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
         emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
                      Pred, PredReg);
@@ -2322,6 +2373,11 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
                      Pred, PredReg);
       }
     }
+  } else if (CalleePopAmount != -1U) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
+    emitSPUpdate(isARM, MBB, I, dl, TII, -CalleePopAmount,
+                 MachineInstr::NoFlags, Pred, PredReg);
   }
   return MBB.erase(I);
 }

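To put numbers on the new epilogue logic, worked through two of the tests
added below: the final SP update is now ReservedArgStack +
IncomingArgStackToRestore rather than just the vararg save size. In
caller_to0_from4 that is 0 + 16, giving the "add sp, #16" before the tail
branch; in caller_to4_from0 the prologue reserved 16 extra bytes and the
TCRETURN carries an adjustment of -16, so the net change is 16 + (-16) = 0 and
no SP update is emitted before "b.w _callee_stack4".
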
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3f4321b23260..4dee7438c955 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2008,6 +2008,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
   case CallingConv::SwiftTail:
     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
   case CallingConv::C:
+  case CallingConv::Tail:
     if (!Subtarget->isAAPCS_ABI())
       return CallingConv::ARM_APCS;
     else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
@@ -2184,19 +2185,31 @@ SDValue ARMTargetLowering::LowerCallResult(
   return Chain;
 }
 
-/// LowerMemOpCallTo - Store the argument to the stack.
-SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
-                                            SDValue Arg, const SDLoc &dl,
-                                            SelectionDAG &DAG,
-                                            const CCValAssign &VA,
-                                            ISD::ArgFlagsTy Flags) const {
-  unsigned LocMemOffset = VA.getLocMemOffset();
-  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
-  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
-                       StackPtr, PtrOff);
-  return DAG.getStore(
-      Chain, dl, Arg, PtrOff,
-      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
+    const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
+    bool IsTailCall, int SPDiff) const {
+  SDValue DstAddr;
+  MachinePointerInfo DstInfo;
+  int32_t Offset = VA.getLocMemOffset();
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (IsTailCall) {
+        Offset += SPDiff;
+        auto PtrVT = getPointerTy(DAG.getDataLayout());
+        int Size = VA.getLocVT().getFixedSizeInBits() / 8;
+        int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstInfo =
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+  } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+        DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                              StackPtr, PtrOff);
+        DstInfo =
+            MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
+  }
+
+  return std::make_pair(DstAddr, DstInfo);
 }
 
 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
@@ -2205,7 +2218,8 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                          CCValAssign &VA, CCValAssign &NextVA,
                                          SDValue &StackPtr,
                                          SmallVectorImpl<SDValue> &MemOpChains,
-                                         ISD::ArgFlagsTy Flags) const {
+                                         bool IsTailCall,
+                                         int SPDiff) const {
   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
   unsigned id = Subtarget->isLittle() ? 0 : 1;
@@ -2219,12 +2233,20 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                     getPointerTy(DAG.getDataLayout()));
 
-    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
-                                           dl, DAG, NextVA,
-                                           Flags));
+    SDValue DstAddr;
+    MachinePointerInfo DstInfo;
+    std::tie(DstAddr, DstInfo) =
+        computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
+    MemOpChains.push_back(
+        DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
   }
 }
 
+static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
+  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
+         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
+}
+
 /// LowerCall - Lowering a call into a callseq_start <-
 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
 /// nodes.
@@ -2249,6 +2271,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool isThisReturn = false;
   bool isCmseNSCall   = false;
+  bool isSibCall = false;
   bool PreferIndirect = false;
 
   // Determine whether this is a non-secure function call.
@@ -2288,6 +2311,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
       report_fatal_error("failed to perform tail call elimination on a call "
                          "site marked musttail");
+
+    if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
+        CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
+      isSibCall = true;
+
     // We don't support GuaranteedTailCallOpt for ARM, only automatically
     // detected sibcalls.
     if (isTailCall)
@@ -2303,13 +2331,40 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
-  if (isTailCall) {
-    // For tail calls, memory operands are available in our caller's stack.
+  // SPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0. Completely unused for non-tail calls.
+  int SPDiff = 0;
+
+  if (isTailCall && !isSibCall) {
+    auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
+    unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
+
+    // Since callee will pop argument stack as a tail call, we must keep the
+    // popped size 16-byte aligned.
+    Align StackAlign = DAG.getDataLayout().getStackAlignment();
+    NumBytes = alignTo(NumBytes, StackAlign);
+
+    // SPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // can actually shrink the stack.
+    SPDiff = NumReusableBytes - NumBytes;
+
+    // If this call requires more stack than we have available from
+    // LowerFormalArguments, tell FrameLowering to reserve space for it.
+    if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
+      AFI->setArgRegsSaveSize(-SPDiff);
+  }
+
+  if (isSibCall) {
+    // For sibling tail calls, memory operands are available in our caller's stack.
     NumBytes = 0;
   } else {
     // Adjust the stack pointer for the new arguments...
     // These operations are automatically eliminated by the prolog/epilog pass
-    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+    Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
   }
 
   SDValue StackPtr =
@@ -2318,6 +2373,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   RegsToPassVector RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
+  // During a tail call, stores to the argument area must happen after all of
+  // the function's incoming arguments have been loaded because they may alias.
+  // This is done by folding in a TokenFactor from LowerFormalArguments, but
+  // there's no point in doing so repeatedly so this tracks whether that's
+  // happened yet.
+  bool AfterFormalArgLoads = false;
+
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization, arguments are handled later.
   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
@@ -2346,6 +2408,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       break;
     }
 
+    if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
+      Chain = DAG.getStackArgumentTokenFactor(Chain);
+      AfterFormalArgLoads = true;
+    }
+
     // f16 arguments have their size extended to 4 bytes and passed as if they
     // had been copied to the LSBs of a 32-bit register.
     // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
@@ -2375,21 +2442,23 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                 DAG.getConstant(1, dl, MVT::i32));
 
       PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
-                       StackPtr, MemOpChains, Flags);
+                       StackPtr, MemOpChains, isTailCall, SPDiff);
 
       VA = ArgLocs[++i]; // skip ahead to next loc
       if (VA.isRegLoc()) {
         PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
-                         StackPtr, MemOpChains, Flags);
+                         StackPtr, MemOpChains, isTailCall, SPDiff);
       } else {
         assert(VA.isMemLoc());
-
-        MemOpChains.push_back(
-            LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
+        SDValue DstAddr;
+        MachinePointerInfo DstInfo;
+        std::tie(DstAddr, DstInfo) =
+            computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
       }
     } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
       PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
-                       StackPtr, MemOpChains, Flags);
+                       StackPtr, MemOpChains, isTailCall, SPDiff);
     } else if (VA.isRegLoc()) {
       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
           Outs[0].VT == MVT::i32) {
@@ -2439,9 +2508,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
       if (Flags.getByValSize() > 4*offset) {
         auto PtrVT = getPointerTy(DAG.getDataLayout());
-        unsigned LocMemOffset = VA.getLocMemOffset();
-        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
-        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
+        SDValue Dst;
+        MachinePointerInfo DstInfo;
+        std::tie(Dst, DstInfo) =
+            computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
@@ -2454,11 +2524,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                           Ops));
       }
-    } else if (!isTailCall) {
+    } else {
       assert(VA.isMemLoc());
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
+      std::tie(DstAddr, DstInfo) =
+          computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
 
-      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
-                                             dl, DAG, VA, Flags));
+      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
+      MemOpChains.push_back(Store);
     }
   }
 
@@ -2622,10 +2696,24 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
   }
 
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (isTailCall && !isSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+    InFlag = Chain.getValue(1);
+  }
+
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
 
+  if (isTailCall) {
+    Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
+  }
+
   // Add argument registers to the end of the list so that they are known live
   // into the call.
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2670,8 +2758,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   InFlag = Chain.getValue(1);
   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
 
+  // If we're guaranteeing tail-calls will be honoured, the callee must
+  // pop its own argument stack on return. But this call is *not* a tail call so
+  // we need to undo that after it returns to restore the status-quo.
+  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+  uint64_t CalleePopBytes =
+      canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
+
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
-                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+                             DAG.getIntPtrConstant(CalleePopBytes, dl, true),
+                             InFlag, dl);
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
@@ -2812,6 +2908,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
   if (CallerF.hasFnAttribute("interrupt"))
     return false;
 
+  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+    return CalleeCC == CallerCC;
+
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
   if (isCalleeStructRet || isCallerStructRet)
@@ -4460,7 +4559,17 @@ SDValue ARMTargetLowering::LowerFormalArguments(
     }
   }
 
-  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  if (canGuaranteeTCO(CallConv, TailCallOpt)) {
+    // The only way to guarantee a tail call is if the callee restores its
+    // argument area, but it must also keep the stack aligned when doing so.
+    const DataLayout &DL = DAG.getDataLayout();
+    StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
+
+    AFI->setArgumentStackToRestore(StackArgSize);
+  }
+  AFI->setArgumentStackSize(StackArgSize);
 
   if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
     DiagnosticInfoUnsupported Diag(

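A rough worked example of SPDiff in LowerCall, using the numbers from the
caller_to20_from4 test added below: the caller owns 16 bytes of incoming
argument stack (4 bytes used, padded to the 16-byte stack alignment on
watchOS), while callee_stack20 needs 32 (20 padded to 32), so
SPDiff = 16 - 32 = -16. LowerCall raises ArgRegsSaveSize so the prologue
reserves the missing 16 bytes, the outgoing stores land at [sp] through
[sp, #16], and the epilogue's net adjustment is zero. For an ordinary
(non-tail) call to one of these conventions, CALLSEQ_END instead carries
CalleePopBytes = alignTo(NumBytes, 16), so eliminateCallFramePseudoInstr
re-extends the stack after the callee pops its arguments.
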
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 80e4e12c702e..5b4a96d93cab 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -756,7 +756,8 @@ class VectorType;
                           CCValAssign &VA, CCValAssign &NextVA,
                           SDValue &StackPtr,
                           SmallVectorImpl<SDValue> &MemOpChains,
-                          ISD::ArgFlagsTy Flags) const;
+                          bool IsTailCall,
+                          int SPDiff) const;
     SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                  SDValue &Root, SelectionDAG &DAG,
                                  const SDLoc &dl) const;
@@ -765,10 +766,10 @@ class VectorType;
                                             bool isVarArg) const;
     CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
                                   bool isVarArg) const;
-    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
-                             const SDLoc &dl, SelectionDAG &DAG,
-                             const CCValAssign &VA,
-                             ISD::ArgFlagsTy Flags) const;
+    std::pair<SDValue, MachinePointerInfo>
+    computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG,
+                          const CCValAssign &VA, SDValue StackPtr,
+                          bool IsTailCall, int SPDiff) const;
     SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;

diff  --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 6b61c598f572..65c7fb5ca118 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -66,7 +66,7 @@ def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
                                            SDTCisInt<1>]>;
 
-def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARMTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
 
 def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
@@ -2629,10 +2629,10 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
 // Tail calls.
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
-  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, i32imm:$SPDiff), IIC_Br, []>,
                    Sched<[WriteBr]>;
 
-  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, i32imm:$SPDiff), IIC_Br, []>,
                    Sched<[WriteBr]>;
 
   def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
@@ -6003,9 +6003,12 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst),
 // TODO: add,sub,and, 3-instr forms?
 
 // Tail calls. These patterns also apply to Thumb mode.
-def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
-def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARMtcret tcGPR:$dst, (i32 timm:$SPDiff)),
+          (TCRETURNri tcGPR:$dst, timm:$SPDiff)>;
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst), (i32 timm:$SPDiff)),
+          (TCRETURNdi texternalsym:$dst, (i32 timm:$SPDiff))>;
+def : Pat<(ARMtcret (i32 texternalsym:$dst), (i32 timm:$SPDiff)),
+          (TCRETURNdi texternalsym:$dst, i32imm:$SPDiff)>;
 
 // Direct calls
 def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;

diff  --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index 298c8a238987..851655284060 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -43,7 +43,9 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// "attach" GPR-part to the part that was passed via stack.
   unsigned StByValParamsPadding = 0;
 
-  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  /// ArgRegsSaveSize - Size of the register save area for vararg functions or
+  /// those making guaranteed tail calls that need more stack argument space
+  /// than is provided by this function's incoming parameters.
   ///
   unsigned ArgRegsSaveSize = 0;
 
@@ -118,6 +120,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// being passed on the stack
   unsigned ArgumentStackSize = 0;
 
+  /// ArgumentStackToRestore - amount of bytes on stack consumed that we must
+  /// restore on return.
+  unsigned ArgumentStackToRestore = 0;
+
   /// CoalescedWeights - mapping of basic blocks to the rolling counter of
   /// coalesced weights.
   DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
@@ -195,6 +201,9 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   unsigned getArgumentStackSize() const { return ArgumentStackSize; }
   void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
 
+  unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+  void setArgumentStackToRestore(unsigned v) { ArgumentStackToRestore = v; }
+
   void initPICLabelUId(unsigned UId) {
     PICLabelUId = UId;
   }

diff  --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 5cb608b74ace..90f1b693fec6 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -230,7 +230,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // registers are the 4 used for parameters.  We don't currently do this
   // case.
 
-  SupportsTailCall = !isThumb() || hasV8MBaselineOps();
+  SupportsTailCall = !isThumb1Only() || hasV8MBaselineOps();
 
   if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
     SupportsTailCall = false;

diff  --git a/llvm/test/CodeGen/ARM/dbg-tcreturn.ll b/llvm/test/CodeGen/ARM/dbg-tcreturn.ll
index d4061be98180..037fda116f38 100644
--- a/llvm/test/CodeGen/ARM/dbg-tcreturn.ll
+++ b/llvm/test/CodeGen/ARM/dbg-tcreturn.ll
@@ -12,7 +12,7 @@ target triple = "thumbv7-apple-ios7.0.0"
 ; CHECK-NEXT:     $r0 = COPY %0
 ; CHECK-NEXT:     $r1 = COPY %1
 ; CHECK-NEXT:     DBG_VALUE $noreg, $noreg, !13, !DIExpression(), debug-location !16
-; CHECK-NEXT:     TCRETURNdi &__divsi3, implicit $sp, implicit $r0, implicit $r1
+; CHECK-NEXT:     TCRETURNdi &__divsi3, 0, implicit $sp, implicit $r0, implicit $r1
 
 define i32 @test(i32 %a1, i32 %a2) !dbg !5 {
 entry:

diff  --git a/llvm/test/CodeGen/ARM/fastcc-tailcall.ll b/llvm/test/CodeGen/ARM/fastcc-tailcall.ll
new file mode 100644
index 000000000000..fc0717278b7e
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/fastcc-tailcall.ll
@@ -0,0 +1,193 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos -tailcallopt | FileCheck %s
+
+declare fastcc void @callee_stack0()
+declare fastcc void @callee_stack4([4 x i32], i32)
+declare fastcc void @callee_stack20([4 x i32], [5 x i32])
+declare extern_weak fastcc void @callee_weak()
+
+define fastcc void @caller_to0_from0() nounwind {
+; CHECK-LABEL: _caller_to0_from0:
+
+  tail call fastcc void @callee_stack0()
+  ret void
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: b.w _callee_stack0
+}
+
+define fastcc void @caller_to0_from4([4 x i32], i32) {
+; CHECK-LABEL: _caller_to0_from4:
+
+  tail call fastcc void @callee_stack0()
+  ret void
+
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack0
+}
+
+define fastcc void @caller_to4_from0() {
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+  tail call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK-LABEL: _caller_to4_from0:
+; CHECK: sub sp, #16
+; CHECK: movs [[TMP:r[0-9]+]], #42
+; CHECK: str [[TMP]], [sp]
+; CHECK-NOT: add sp
+; CHECK: b.w _callee_stack4
+
+}
+
+define fastcc void @caller_to4_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to4_from4:
+; CHECK-NOT: sub sp
+; Key point is that the outgoing "42" should go right at SP on entry,
+; reusing %a's slot.
+  tail call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK: str {{r[0-9]+}}, [sp]
+; CHECK-NOT: add sp
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+define fastcc void @caller_to20_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to20_from4:
+; CHECK: sub sp, #16
+
+; Important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; _callee will not deallocate the space, even in fastcc.
+  tail call fastcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5])
+
+; CHECK: str {{.*}}, [sp]
+; CHECK: str {{.*}}, [sp, #4]
+; CHECK: str {{.*}}, [sp, #8]
+; CHECK: str {{.*}}, [sp, #12]
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK-NOT: add sp
+; CHECK-NOT: sub sp
+; CHECK: b.w _callee_stack20
+  ret void
+}
+
+
+define fastcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: _caller_to4_from24:
+
+
+; Key point is that the outgoing "42" should go at #16 above SP on entry.
+  tail call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+
+define fastcc void @caller_to20_from20([4 x i32], [5 x i32] %a) {
+; CHECK-LABEL: _caller_to20_from20:
+; CHECK-NOT: add sp,
+; CHECK-NOT: sub sp,
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise %a will be wrongly clobbered.
+  tail call fastcc void @callee_stack20([4 x i32] undef, [5 x i32] %a)
+  ret void
+
+  ; If these ever get interleaved make sure aliasing slots don't clobber each
+  ; other.
+; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12]
+; CHECK: ldm.w sp,
+; CHECK: stm.w
+; CHECK: strd
+; CHECK-NEXT: b.w _callee_stack20
+}
+
+define fastcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+
+  tail call fastcc void @callee_stack0()
+  ret void
+
+; CHECK: bl _callee_stack0
+; CHECK: ret
+}
+
+define fastcc void @normal_ret_with_stack([4 x i32], i32 %a) {
+; CHECK: _normal_ret_with_stack:
+; CHECK: add sp, #16
+; CHECK: bx lr
+  ret void
+}
+
+declare { [2 x float] } @get_vec2()
+
+define void @fromC_totail() {
+; COMMON-LABEL: fromC_totail:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; We must reset the stack to where it was before the call by undoing its extra stack pop.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl callee_stack4
+; COMMON: sub sp, #16
+
+  call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+define void @fromC_totail_noreservedframe(i32 %len) {
+; COMMON-LABEL: fromC_totail_noreservedframe:
+; COMMON: sub.w sp, sp, r{{.*}}
+
+; COMMON: movs [[TMP:r[0-9]+]], #42
+  ; Note stack is subtracted here to allocate space for arg
+; COMMON: sub.w sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; And here.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; But not restored here because callee_stack4 did that for us.
+; COMMON-NOT: sub sp,
+
+  ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs.
+  %var = alloca i32, i32 %len
+
+  call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  call fastcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+declare void @Ccallee_stack4([4 x i32], i32)
+
+define fastcc void @fromtail_toC() {
+; COMMON-LABEL: fromtail_toC:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _Ccallee_stack4
+  ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything.
+; COMMON-NOT: add sp,
+; COMMON-NOT: sub sp,
+; COMMON: str [[TMP]], [sp]{{$}}
+; COMMON: bl _Ccallee_stack4
+; COMMON-NOT: sub sp,
+
+  call void @Ccallee_stack4([4 x i32] undef, i32 42)
+  call void @Ccallee_stack4([4 x i32] undef, i32 42)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir b/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
index 104c887b5f7d..8ee4a80067fa 100644
--- a/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
+++ b/llvm/test/CodeGen/ARM/peephole-callee-save-regalloc.mir
@@ -41,5 +41,5 @@ body:             |
     $r1 = COPY %1
     $r2 = COPY %2
     $r3 = COPY %3
-    TCRETURNri killed %5, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
+    TCRETURNri killed %5, 0, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3
 ...

diff  --git a/llvm/test/CodeGen/ARM/swifttailcc-call.ll b/llvm/test/CodeGen/ARM/swifttailcc-call.ll
new file mode 100644
index 000000000000..2514e26900ee
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/swifttailcc-call.ll
@@ -0,0 +1,201 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos | FileCheck %s
+
+declare swifttailcc void @callee_stack0()
+declare swifttailcc void @callee_stack4([4 x i32], i32)
+declare swifttailcc void @callee_stack20([4 x i32], [5 x i32])
+declare extern_weak swifttailcc void @callee_weak()
+
+define swifttailcc void @caller_to0_from0() nounwind {
+; CHECK-LABEL: _caller_to0_from0:
+
+  tail call swifttailcc void @callee_stack0()
+  ret void
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: b.w _callee_stack0
+}
+
+define swifttailcc void @caller_to0_from4([4 x i32], i32) {
+; CHECK-LABEL: _caller_to0_from4:
+
+  tail call swifttailcc void @callee_stack0()
+  ret void
+
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack0
+}
+
+define swifttailcc void @caller_to4_from0() {
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+  tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK-LABEL: _caller_to4_from0:
+; CHECK: sub sp, #16
+; CHECK: movs [[TMP:r[0-9]+]], #42
+; CHECK: str [[TMP]], [sp]
+; CHECK-NOT: add sp
+; CHECK: b.w _callee_stack4
+
+}
+
+define swifttailcc void @caller_to4_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to4_from4:
+; CHECK-NOT: sub sp
+; Key point is that the outgoing "42" should go right at SP on entry,
+; reusing %a's slot.
+  tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK: str {{r[0-9]+}}, [sp]
+; CHECK-NOT: add sp
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+define swifttailcc void @caller_to20_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to20_from4:
+; CHECK: sub sp, #16
+
+; Important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; _callee will not deallocate the space, even in swifttailcc.
+  tail call swifttailcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5])
+
+; CHECK: str {{.*}}, [sp]
+; CHECK: str {{.*}}, [sp, #4]
+; CHECK: str {{.*}}, [sp, #8]
+; CHECK: str {{.*}}, [sp, #12]
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK-NOT: add sp
+; CHECK-NOT: sub sp
+; CHECK: b.w _callee_stack20
+  ret void
+}
+
+
+define swifttailcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: _caller_to4_from24:
+
+
+; Key point is that the outgoing "42" should go at #16 above SP on entry.
+  tail call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+
+define swifttailcc void @caller_to20_from20([4 x i32], [5 x i32] %a) {
+; CHECK-LABEL: _caller_to20_from20:
+; CHECK-NOT: add sp,
+; CHECK-NOT: sub sp,
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise %a will be wrongly clobbered.
+  tail call swifttailcc void @callee_stack20([4 x i32] undef, [5 x i32] %a)
+  ret void
+
+  ; If these ever get interleaved make sure aliasing slots don't clobber each
+  ; other.
+; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12]
+; CHECK: ldm.w sp,
+; CHECK: stm.w
+; CHECK: strd
+; CHECK-NEXT: b.w _callee_stack20
+}
+
+define swifttailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+
+  tail call swifttailcc void @callee_stack0()
+  ret void
+
+; CHECK: bl _callee_stack0
+; CHECK: ret
+}
+
+define swifttailcc void @normal_ret_with_stack([4 x i32], i32 %a) {
+; CHECK: _normal_ret_with_stack:
+; CHECK: add sp, #16
+; CHECK: bx lr
+  ret void
+}
+
+declare { [2 x float] } @get_vec2()
+
+define void @fromC_totail() {
+; COMMON-LABEL: fromC_totail:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; We must reset the stack to where it was before the call by undoing its extra stack pop.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl callee_stack4
+; COMMON: sub sp, #16
+
+  call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+define void @fromC_totail_noreservedframe(i32 %len) {
+; COMMON-LABEL: fromC_totail_noreservedframe:
+; COMMON: sub.w sp, sp, r{{.*}}
+
+; COMMON: movs [[TMP:r[0-9]+]], #42
+  ; Note stack is subtracted here to allocate space for arg
+; COMMON: sub.w sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; And here.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; But not restored here because callee_stack4 did that for us.
+; COMMON-NOT: sub sp,
+
+  ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs.
+  %var = alloca i32, i32 %len
+
+  call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  call swifttailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+declare void @Ccallee_stack4([4 x i32], i32)
+
+define swifttailcc void @fromtail_toC() {
+; COMMON-LABEL: fromtail_toC:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _Ccallee_stack4
+  ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything.
+; COMMON-NOT: add sp,
+; COMMON-NOT: sub sp,
+; COMMON: str [[TMP]], [sp]{{$}}
+; COMMON: bl _Ccallee_stack4
+; COMMON-NOT: sub sp,
+
+  call void @Ccallee_stack4([4 x i32] undef, i32 42)
+  call void @Ccallee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure)
+define swiftcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) {
+; CHECK-LABEL: CallSwiftSelf:
+; CHECK: push{{.*}}r10
+  %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure)
+  ret i8* %res
+}

diff  --git a/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll b/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll
new file mode 100644
index 000000000000..7d6af2d801aa
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/swifttailcc-fastisel.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -O0 -fast-isel %s -o - | FileCheck %s
+
+declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure)
+
+define swifttailcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) {
+; CHECK-LABEL: CallSwiftSelf:
+; CHECK: bl _SwiftSelf
+; CHECK: pop {r7, pc}
+  %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself null)
+  ret i8* %res
+}

diff  --git a/llvm/test/CodeGen/ARM/tailcc-call.ll b/llvm/test/CodeGen/ARM/tailcc-call.ll
new file mode 100644
index 000000000000..ced6f02978dd
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/tailcc-call.ll
@@ -0,0 +1,193 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7k-apple-watchos | FileCheck %s
+
+declare tailcc void @callee_stack0()
+declare tailcc void @callee_stack4([4 x i32], i32)
+declare tailcc void @callee_stack20([4 x i32], [5 x i32])
+declare extern_weak tailcc void @callee_weak()
+
+define tailcc void @caller_to0_from0() nounwind {
+; CHECK-LABEL: _caller_to0_from0:
+
+  tail call tailcc void @callee_stack0()
+  ret void
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: b.w _callee_stack0
+}
+
+define tailcc void @caller_to0_from4([4 x i32], i32) {
+; CHECK-LABEL: _caller_to0_from4:
+
+  tail call tailcc void @callee_stack0()
+  ret void
+
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack0
+}
+
+define tailcc void @caller_to4_from0() {
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+  tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK-LABEL: _caller_to4_from0:
+; CHECK: sub sp, #16
+; CHECK: movs [[TMP:r[0-9]+]], #42
+; CHECK: str [[TMP]], [sp]
+; CHECK-NOT: add sp
+; CHECK: b.w _callee_stack4
+
+}
+
+define tailcc void @caller_to4_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to4_from4:
+; CHECK-NOT: sub sp
+; Key point is that the outgoing "42" should go right at SP on entry,
+; reusing %a's slot.
+  tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK: str {{r[0-9]+}}, [sp]
+; CHECK-NOT: add sp
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+define tailcc void @caller_to20_from4([4 x i32], i32 %a) {
+; CHECK-LABEL: _caller_to20_from4:
+; CHECK: sub sp, #16
+
+; Important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; _callee will not deallocate the space, even in tailcc.
+  tail call tailcc void @callee_stack20([4 x i32] undef, [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5])
+
+; CHECK: str {{.*}}, [sp]
+; CHECK: str {{.*}}, [sp, #4]
+; CHECK: str {{.*}}, [sp, #8]
+; CHECK: str {{.*}}, [sp, #12]
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK-NOT: add sp
+; CHECK-NOT: sub sp
+; CHECK: b.w _callee_stack20
+  ret void
+}
+
+
+define tailcc void @caller_to4_from24([4 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: _caller_to4_from24:
+
+
+; Key point is that the outgoing "42" should go at #16 above SP on entry.
+  tail call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+
+; CHECK: str {{.*}}, [sp, #16]
+; CHECK: add sp, #16
+; CHECK-NEXT: b.w _callee_stack4
+}
+
+
+define tailcc void @caller_to20_from20([4 x i32], [5 x i32] %a) {
+; CHECK-LABEL: _caller_to20_from20:
+; CHECK-NOT: add sp,
+; CHECK-NOT: sub sp,
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise %a will be wrongly clobbered.
+  tail call tailcc void @callee_stack20([4 x i32] undef, [5 x i32] %a)
+  ret void
+
+  ; If these ever get interleaved make sure aliasing slots don't clobber each
+  ; other.
+; CHECK: ldrd {{.*}}, {{.*}}, [sp, #12]
+; CHECK: ldm.w sp,
+; CHECK: stm.w
+; CHECK: strd
+; CHECK-NEXT: b.w _callee_stack20
+}
+
+define tailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+
+  tail call tailcc void @callee_stack0()
+  ret void
+
+; CHECK: bl _callee_stack0
+; CHECK: ret
+}
+
+define tailcc void @normal_ret_with_stack([4 x i32], i32 %a) {
+; CHECK: _normal_ret_with_stack:
+; CHECK: add sp, #16
+; CHECK: bx lr
+  ret void
+}
+
+declare { [2 x float] } @get_vec2()
+
+define void @fromC_totail() {
+; COMMON-LABEL: fromC_totail:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; We must reset the stack to where it was before the call by undoing its extra stack pop.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl callee_stack4
+; COMMON: sub sp, #16
+
+  call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+define void @fromC_totail_noreservedframe(i32 %len) {
+; COMMON-LABEL: fromC_totail_noreservedframe:
+; COMMON: sub.w sp, sp, r{{.*}}
+
+; COMMON: movs [[TMP:r[0-9]+]], #42
+  ; Note stack is subtracted here to allocate space for arg
+; COMMON: sub.w sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; And here.
+; COMMON: sub sp, #16
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _callee_stack4
+  ; But not restored here because callee_stack4 did that for us.
+; COMMON-NOT: sub sp,
+
+  ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs.
+  %var = alloca i32, i32 %len
+
+  call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  call tailcc void @callee_stack4([4 x i32] undef, i32 42)
+  ret void
+}
+
+declare void @Ccallee_stack4([4 x i32], i32)
+
+define tailcc void @fromtail_toC() {
+; COMMON-LABEL: fromtail_toC:
+; COMMON: push {r4, lr}
+; COMMON: sub sp, #8
+
+; COMMON-NOT: sub sp,
+; COMMON: movs [[TMP:r[0-9]+]], #42
+; COMMON: str [[TMP]], [sp]
+; COMMON: bl _Ccallee_stack4
+  ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything.
+; COMMON-NOT: add sp,
+; COMMON-NOT: sub sp,
+; COMMON: str [[TMP]], [sp]{{$}}
+; COMMON: bl _Ccallee_stack4
+; COMMON-NOT: sub sp,
+
+  call void @Ccallee_stack4([4 x i32] undef, i32 42)
+  call void @Ccallee_stack4([4 x i32] undef, i32 42)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/ARM/v8m-tail-call.ll b/llvm/test/CodeGen/ARM/v8m-tail-call.ll
index 7ee80d4b9b96..c683230c3460 100644
--- a/llvm/test/CodeGen/ARM/v8m-tail-call.ll
+++ b/llvm/test/CodeGen/ARM/v8m-tail-call.ll
@@ -41,25 +41,30 @@ declare i32 @h2(i32, i32, i32, i32, i32)
 define hidden i32 @f2(i32, i32, i32, i32, i32) {
 ; CHECK-LABEL: f2:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    mov r4, r3
 ; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    ldr r7, [sp, #24]
 ; CHECK-NEXT:    bl g
 ; CHECK-NEXT:    cbz r0, .LBB2_2
 ; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:    str r7, [sp, #24]
 ; CHECK-NEXT:    mov r1, r6
 ; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:    ldr r4, [sp, #12]
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    ldr r4, [sp, #16]
 ; CHECK-NEXT:    mov lr, r4
-; CHECK-NEXT:    pop {r4, r5, r6}
+; CHECK-NEXT:    pop {r4, r5, r6, r7}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    b h2
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    mvns r0, r0
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
   %6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
   %7 = icmp eq i32 %6, 0
   br i1 %7, label %10, label %8


        

