[llvm] r310753 - AMDGPU: Start adding tail call support

Matt Arsenault via llvm-commits <llvm-commits@lists.llvm.org>
Fri Aug 11 13:42:08 PDT 2017


Author: arsenm
Date: Fri Aug 11 13:42:08 2017
New Revision: 310753

URL: http://llvm.org/viewvc/llvm-project?rev=310753&view=rev
Log:
AMDGPU: Start adding tail call support

Handle the sibling call cases.

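For context, a "sibling" call is a tail call made without GuaranteedTailCallOpt: the caller's stack layout stays fixed and the callee reuses the incoming argument area, so no stack adjustment is needed. A one-line condensation of the decision added in LowerCall below (the helper is hypothetical; the real code sets an IsSibCall flag):

  // With guaranteed TCO off, any tail call that passes the eligibility
  // checks is emitted as a sibling call.
  static bool isSibCall(bool EligibleTailCall, bool GuaranteedTailCallOpt) {
    return EligibleTailCall && !GuaranteedTailCallOpt;
  }
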
Added:
    llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/trunk/test/CodeGen/AMDGPU/tail-call-cgp.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
    llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp Fri Aug 11 13:42:08 2017
@@ -631,10 +631,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo
       }
 
       if (MI.isCall()) {
-        assert(MI.getOpcode() == AMDGPU::SI_CALL);
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
-        const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal());
+
+        const MachineOperand *CalleeOp
+          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
         if (Callee->isDeclaration()) {
           // If this is a call to an external function, we can't do much. Make
           // conservative guesses.

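The motivation for the change above: SI_CALL's callee was assumed to sit at a fixed operand index, which no longer holds once SI_TCRETURN shares this path. Looking the operand up by name keeps the resource analysis working for both pseudos. A minimal sketch of the pattern, assuming a pseudo with UseNamedOperandTable set (as both pseudos have in the SIInstructions.td hunk below; analyzeCallee is a hypothetical consumer):

  // getNamedOperand resolves an operand by its TableGen name, so the
  // position of $callee may differ between SI_CALL and SI_TCRETURN.
  if (const MachineOperand *CalleeOp =
          TII->getNamedOperand(MI, AMDGPU::OpName::callee))
    if (const auto *Callee = dyn_cast<Function>(CalleeOp->getGlobal()))
      analyzeCallee(*Callee); // hypothetical consumer of the callee
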
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Fri Aug 11 13:42:08 2017
@@ -1001,6 +1001,42 @@ CCAssignFn *AMDGPUTargetLowering::CCAssi
   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 }
 
+SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
+                                                  SelectionDAG &DAG,
+                                                  MachineFrameInfo &MFI,
+                                                  int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each incoming stack-argument load that overlaps
+  // the object being clobbered.
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+                            UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U) {
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+      }
+    }
+  }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                  SmallVectorImpl<SDValue> &InVals,
                                                  StringRef Reason) const {
@@ -3658,6 +3694,7 @@ const char* AMDGPUTargetLowering::getTar
   NODE_NAME_CASE(ELSE)
   NODE_NAME_CASE(LOOP)
   NODE_NAME_CASE(CALL)
+  NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_FLAG)
   NODE_NAME_CASE(RETURN_TO_EPILOG)

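The overlap condition inside addTokenForArgument is a closed-interval intersection test over frame-object byte ranges. A self-contained restatement (the helper name is hypothetical):

  #include <cstdint>

  // Two non-empty byte ranges [FirstByte, LastByte] and [InFirstByte,
  // InLastByte] overlap iff one range's start lies within the other; this
  // mirrors the condition used in addTokenForArgument above.
  static bool objectsOverlap(int64_t FirstByte, int64_t LastByte,
                             int64_t InFirstByte, int64_t InLastByte) {
    return (InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
           (FirstByte <= InFirstByte && InFirstByte <= LastByte);
  }
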
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Fri Aug 11 13:42:08 2017
@@ -172,6 +172,11 @@ public:
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;
 
+  SDValue addTokenForArgument(SDValue Chain,
+                              SelectionDAG &DAG,
+                              MachineFrameInfo &MFI,
+                              int ClobberedFI) const;
+
   SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals,
                              StringRef Reason) const;
@@ -291,6 +296,7 @@ enum NodeType : unsigned {
 
   // Function call.
   CALL,
+  TC_RETURN,
   TRAP,
 
   // Masked control flow nodes.

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td Fri Aug 11 13:42:08 2017
@@ -74,6 +74,8 @@ def AMDGPUAddeSubeOp : SDTypeProfile<2,
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
 >;
 
+def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -98,6 +100,10 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL
   SDNPVariadic]
 >;
 
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
   SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
     [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]

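SDT_AMDGPUTCRET gives the node no results and two fixed operands: a pointer-typed callee address and the FPDiff immediate; SDNPVariadic lets the argument registers and the register mask trail them. On the C++ side the node is built roughly as follows (a sketch; the full Ops layout appears in the SIISelLowering.cpp hunk below):

  // Ops holds: chain, callee, FPDiff, the physical return-address register,
  // the argument registers, and the call-preserved register mask.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue TCRet = DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
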
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp Fri Aug 11 13:42:08 2017
@@ -146,6 +146,9 @@ void AMDGPUMCInstLower::lower(const Mach
     OutMI.addOperand(Dest);
     OutMI.addOperand(Src);
     return;
+  } else if (Opcode == AMDGPU::SI_TCRETURN) {
+    // TODO: How to use branch immediate and avoid register+add?
+    Opcode = AMDGPU::S_SETPC_B64;
   }
 
   int MCOpcode = TII->pseudoToMCOpcode(Opcode);

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Aug 11 13:42:08 2017
@@ -32,6 +32,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
@@ -84,6 +85,10 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "si-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
 static cl::opt<bool> EnableVGPRIndexMode(
   "amdgpu-vgpr-index-mode",
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
@@ -1647,6 +1652,9 @@ SDValue SITargetLowering::LowerFormalArg
     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
   ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
 
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  Info->setBytesInStackArgArea(StackArgSize);
+
   return Chains.empty() ? Chain :
     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
@@ -1955,6 +1963,103 @@ void SITargetLowering::passSpecialInputs
   }
 }
 
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+bool SITargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+
+  // Kernels aren't callable, and don't have a live-in return address, so it
+  // doesn't make sense to do a tail call from an entry function.
+  if (!CallerPreserved)
+    return false;
+
+  bool CCMatch = CallerCC == CalleeCC;
+
+  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+    if (canGuaranteeTCO(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // TODO: Can we handle var args?
+  if (IsVarArg)
+    return false;
+
+  for (const Argument &Arg : CallerF->args()) {
+    if (Arg.hasByValAttr())
+      return false;
+  }
+
+  LLVMContext &Ctx = *DAG.getContext();
+
+  // Check that the call results are passed in the same way.
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
+                                  CCAssignFnForCall(CalleeCC, IsVarArg),
+                                  CCAssignFnForCall(CallerCC, IsVarArg)))
+    return false;
+
+  // The callee has to preserve all registers the caller needs to preserve.
+  if (!CCMatch) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  // Nothing more to check if the callee is taking no arguments.
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
+
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  // If the stack arguments for this call do not fit into our own save area,
+  // then the call cannot be made a tail call.
+  // TODO: Is this really necessary?
+  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
+}
+
+bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+
+  const Function *ParentFn = CI->getParent()->getParent();
+  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
+    return false;
+
+  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
+  return (Attr.getValueAsString() != "true");
+}
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
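
Condensing the new predicates above into a single view (a hypothetical restatement; the real logic is isEligibleForTailCallOptimization, spelled out check by check above):

  // Each parameter stands in for one of the checks performed above.
  static bool eligibleForTailCall(bool CCMayTailCall, bool CallerIsEntryFn,
                                  bool IsVarArg, bool CallerHasByValArg,
                                  bool ResultsCompatible, bool RegMaskOk,
                                  unsigned CalleeStackBytes,
                                  unsigned CallerStackArgArea) {
    return CCMayTailCall && !CallerIsEntryFn && !IsVarArg &&
           !CallerHasByValArg && ResultsCompatible && RegMaskOk &&
           CalleeStackBytes <= CallerStackArgArea;
  }
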
@@ -1987,8 +2092,27 @@ SDValue SITargetLowering::LowerCall(Call
                               "unsupported required tail call to function ");
   }
 
-  // TODO: Implement tail calls.
-  IsTailCall = false;
+  // The first 4 bytes are reserved for the callee's emergency stack slot.
+  const unsigned CalleeUsableStackOffset = 4;
+
+  if (IsTailCall) {
+    IsTailCall = isEligibleForTailCallOptimization(
+      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+    }
+
+    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+
+    if (IsTailCall)
+      ++NumTailCalls;
+  }
 
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
     // FIXME: Remove this hack for function pointer types.
@@ -2020,8 +2144,8 @@ SDValue SITargetLowering::LowerCall(Call
   // by this amount for a tail call. In a sibling call it must be 0 because the
   // caller will deallocate the entire stack and the callee still expects its
   // arguments to begin at SP+0. Completely unused for non-tail calls.
-  int FPDiff = 0;
-
+  int32_t FPDiff = 0;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
 
   // Adjust the stack pointer for the new arguments...
@@ -2044,9 +2168,7 @@ SDValue SITargetLowering::LowerCall(Call
 
   // Stack pointer relative accesses are done by changing the offset SGPR. This
   // is just the VGPR offset component.
-
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  SDValue StackPtr = DAG.getConstant(4, DL, MVT::i32);
+  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
 
   SmallVector<SDValue, 8> MemOpChains;
   MVT PtrVT = MVT::i32;
@@ -2093,10 +2215,28 @@ SDValue SITargetLowering::LowerCall(Call
       SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
 
-      if (!IsTailCall) {
-        SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
-
-        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+      if (IsTailCall) {
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        unsigned OpSize = Flags.isByVal() ?
+          Flags.getByValSize() : VA.getValVT().getStoreSize();
+
+        Offset = Offset + FPDiff;
+        int FI = MFI.CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
+        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+
+        // FIXME: Why is this really necessary? This seems to just result in a
+        // lot of code to copy the stack and write them back to the same
+        // locations, which are supposed to be immutable?
+        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
+      } else {
+        DstAddr = PtrOff;
         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
       }
 
@@ -2132,6 +2272,22 @@ SDValue SITargetLowering::LowerCall(Call
     InFlag = Chain.getValue(1);
   }
 
+
+  SDValue PhysReturnAddrReg;
+  if (IsTailCall) {
+    // Since the return is being combined with the call, we need to pass on the
+    // return address.
+
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                        MVT::i64);
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // We don't usually want to end the call-sequence here because we would tidy
   // the frame up *after* the call, however in the ABI-changing tail-call case
   // we've carefully laid out the parameters so that when sp is reset they'll be
@@ -2153,6 +2309,8 @@ SDValue SITargetLowering::LowerCall(Call
     // this information must travel along with the operation for eventual
     // consumption by emitEpilogue.
     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+
+    Ops.push_back(PhysReturnAddrReg);
   }
 
   // Add argument registers to the end of the list so that they are known live
@@ -2177,8 +2335,8 @@ SDValue SITargetLowering::LowerCall(Call
   // If we're doing a tail call, use a TC_RETURN here rather than an
   // actual call instruction.
   if (IsTailCall) {
-    MF.getFrameInfo().setHasTailCall();
-    llvm_unreachable("not implemented");
+    MFI.setHasTailCall();
+    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
   }
 
   // Returns a chain and a flag for retval copy to use.
@@ -2873,7 +3031,8 @@ MachineBasicBlock *SITargetLowering::Emi
         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL_ISEL: {
+  case AMDGPU::SI_CALL_ISEL:
+  case AMDGPU::SI_TCRETURN_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
@@ -2885,17 +3044,24 @@ MachineBasicBlock *SITargetLowering::Emi
 
     const GlobalValue *G = PCRel->getOperand(1).getGlobal();
 
-    MachineInstrBuilder MIB =
-      BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
-      .add(MI.getOperand(0))
-      .addGlobalAddress(G);
+    MachineInstrBuilder MIB;
+    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
+      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
+        .add(MI.getOperand(0))
+        .addGlobalAddress(G);
+    } else {
+      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
+        .add(MI.getOperand(0))
+        .addGlobalAddress(G);
+
+      // There is an additional imm operand for tcreturn, but it should be in the
+      // right place already.
+    }
 
     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
 
-
     MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
     MI.eraseFromParent();
     return BB;
   }

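Putting the SIISelLowering.cpp pieces together, a sibling call travels through three forms (summarized from the hunks in this patch; the custom inserter recovers the callee global by looking through the PC-relative address pseudo, PCRel in the code above):

  AMDGPUISD::TC_RETURN  --(isel pattern)-->    SI_TCRETURN_ISEL
  SI_TCRETURN_ISEL      --(custom inserter)--> SI_TCRETURN
  SI_TCRETURN           --(MC lowering)-->     S_SETPC_B64
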
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Fri Aug 11 13:42:08 2017
@@ -224,6 +224,15 @@ public:
                           const SDLoc &DL, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
+
+  bool mayBeEmittedAsTailCall(const CallInst *) const override;
+
+  bool isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Fri Aug 11 13:42:08 2017
@@ -378,6 +378,31 @@ def SI_CALL : SPseudoInstSI <
   (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
   let Size = 4;
   let isCall = 1;
+  let UseNamedOperandTable = 1;
+  let SchedRW = [WriteBranch];
+}
+
+// Tail call handling pseudo
+def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
+  (ins SSrc_b64:$src0, i32imm:$fpdiff),
+  [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let SchedRW = [WriteBranch];
+  let usesCustomInserter = 1;
+}
+
+def SI_TCRETURN : SPseudoInstSI <
+  (outs),
+  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+  let Size = 4;
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let UseNamedOperandTable = 1;
   let SchedRW = [WriteBranch];
 }
 
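The flag bits on the new tail-call pseudos are what make later passes treat them as both a call and the end of the block. A small sketch of how those bits surface through the generated instruction descriptions (assumed usage; TII is the target's SIInstrInfo):

  // SI_TCRETURN is a call that also terminates its block like a return.
  const MCInstrDesc &Desc = TII->get(AMDGPU::SI_TCRETURN);
  assert(Desc.isCall() && Desc.isTerminator() && Desc.isReturn() &&
         Desc.isBarrier());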

Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h?rev=310753&r1=310752&r2=310753&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h Fri Aug 11 13:42:08 2017
@@ -110,6 +110,17 @@ class SIMachineFunctionInfo final : publ
   unsigned PSInputAddr = 0;
   unsigned PSInputEnable = 0;
 
+  /// Number of bytes of arguments this function has on the stack. If the
+  /// callee is expected to restore the argument stack, this should be a
+  /// multiple of 16, all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8 bytes of stack-argument
+  /// space to a function with 16 bytes, then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea = 0;
+
   bool ReturnsVoid = true;
 
   // A pair of default/requested minimum/maximum flat work group sizes.
@@ -235,6 +246,14 @@ public:
   unsigned getTIDReg() const { return TIDReg; }
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
 
+  unsigned getBytesInStackArgArea() const {
+    return BytesInStackArgArea;
+  }
+
+  void setBytesInStackArgArea(unsigned Bytes) {
+    BytesInStackArgArea = Bytes;
+  }
+
   // Add user SGPRs.
   unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
   unsigned addDispatchPtr(const SIRegisterInfo &TRI);

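A worked example of the BytesInStackArgArea check in isEligibleForTailCallOptimization above (illustrative numbers only):

  // The caller's incoming stack-argument area is all a sibling call may
  // reuse; a callee needing more would force a stack adjustment that the
  // callee could not undo after control transfers.
  unsigned CallerStackArgArea = 8; // caller's incoming stack args, bytes
  unsigned CalleeStackBytes = 16;  // callee's stack-argument requirement
  bool CanSibCall = CalleeStackBytes <= CallerStackArgArea; // false here
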
Added: llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll?rev=310753&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll Fri Aug 11 13:42:08 2017
@@ -0,0 +1,225 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
+
+; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
+  %add0 = add i32 %arg0, %arg1
+  ret i32 %add0
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
+; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
+define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret void
+}
+
+; It doesn't make sense to do a tail call from a kernel.
+; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
+define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret void
+}
+
+; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) #1 {
+  %arg1.load = load i32, i32* %arg1, align 4
+  %add0 = add i32 %arg0, %arg1.load
+  ret i32 %add0
+}
+
+; Tail call disallowed with byval in parent.
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: s_swappc_b64
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32* byval %b.byval, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* %b.byval)
+  ret i32 %ret
+}
+
+; Tail call allowed here: byval is disallowed in the parent, not the callee.
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
+; GCN-NOT: v0
+; GCN-NOT: s32
+; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* inttoptr (i32 16 to i32*))
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: v_add_i32_e32 v0, vcc, v1, v0
+; GCN: v_add_i32_e32 v0, vcc, [[LOAD_0]], v0
+; GCN: v_add_i32_e32 v0, vcc, [[LOAD_1]], v0
+; GCN-NEXT: s_setpc_b64
+define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
+  %val_firststack = extractvalue [32 x i32] %large, 30
+  %val_laststack = extractvalue [32 x i32] %large, 31
+  %add0 = add i32 %arg0, %arg1
+  %add1 = add i32 %add0, %val_firststack
+  %add2 = add i32 %add1, %val_laststack
+  ret i32 %add2
+}
+
+; FIXME: Why load and store same location for stack args?
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
+; GCN: s_mov_b32 s5, s32
+
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
+
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
+
+; GCN-NOT: s32
+
+; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8
+
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
+
+; GCN-NOT: s32
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
+; GCN-DAG: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44
+
+; GCN-NOT: s32
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
+  ret i32 %ret
+}
+
+; If the callee requires more stack argument space than the caller,
+; don't do a tail call.
+; TODO: Do we really need this restriction?
+
+; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
+; GCN: s_swappc_b64
+; GCN: s_setpc_b64
+define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
+  ret i32 %ret
+}
+
+; Have another non-tail call in the function.
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: v_writelane_b32 v34, s33, 0
+; GCN-DAG: v_writelane_b32 v34, s34, 1
+; GCN-DAG: v_writelane_b32 v34, s35, 2
+; GCN-DAG: s_add_u32 s32, s32, 0x400
+
+; GCN: s_getpc_b64
+; GCN: s_swappc_b64
+
+; GCN: s_getpc_b64 s[6:7]
+; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4
+
+; GCN-DAG: v_readlane_b32 s33, v34, 0
+; GCN-DAG: v_readlane_b32 s34, v34, 1
+; GCN-DAG: v_readlane_b32 s35, v34, 2
+
+; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_sub_u32 s32, s32, 0x400
+; GCN: s_setpc_b64 s[6:7]
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
+  ret i32 %ret
+}
+
+; Have a stack object in the caller and stack-passed arguments. SP should be
+; in the same place at function exit.
+
+; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN: s_setpc_b64 s[6:7]
+define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN: s_setpc_b64 s[6:7]
+define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
+  ret i32 %ret
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noinline }

Added: llvm/trunk/test/CodeGen/AMDGPU/tail-call-cgp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/tail-call-cgp.ll?rev=310753&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/tail-call-cgp.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/tail-call-cgp.ll Fri Aug 11 13:42:08 2017
@@ -0,0 +1,43 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare %s | FileCheck %s
+
+define internal fastcc void @callee(i32* nocapture %p, i32 %a) #0 {
+  store volatile i32 %a, i32* %p, align 4
+  ret void
+}
+
+; CHECK-LABEL: @func_caller(
+; CHECK: tail call fastcc void @callee(
+; CHECK-NEXT: ret void
+; CHECK: ret void
+define void @func_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %bb, label %ret
+
+bb:
+  tail call fastcc void @callee(i32* %p, i32 %a)
+  br label %ret
+
+ret:
+  ret void
+}
+
+; CHECK-LABEL: @kernel_caller(
+; CHECK: tail call fastcc void @callee(
+; CHECK-NEXT: br label %ret
+
+; CHECK: ret void
+define amdgpu_kernel void @kernel_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %bb, label %ret
+
+bb:
+  tail call fastcc void @callee(i32* %p, i32 %a)
+  br label %ret
+
+ret:
+  ret void
+}
+
+attributes #0 = { nounwind }



