[llvm-branch-commits] [llvm] release/22.x: x86: fix musttail sibcall miscompilation (#168956) (PR #176470)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Jan 16 12:50:09 PST 2026


https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/176470

Backport 782bf6aff6ba6e9617bd3c4e27b3b9220ed5c850

Requested by: @rnk

From be744b38c7a2b74bbb2baa88f6330e34f88061bc Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Wed, 14 Jan 2026 23:58:18 +0100
Subject: [PATCH] x86: fix musttail sibcall miscompilation (#168956)

fixes https://github.com/llvm/llvm-project/issues/56891
fixes https://github.com/llvm/llvm-project/issues/72390
fixes https://github.com/llvm/llvm-project/issues/147813

Currently the x86 backend miscompiles straightforward tail calls when
the stack is used for argument passing. This program segfaults on any
optimization level:

https://godbolt.org/z/5xr99jr4v

```c
#include <stdint.h>

typedef struct {
    uint64_t x;
    uint64_t y;
    uint64_t z;
} S;

__attribute__((noinline))
uint64_t callee(S s) {
    return s.x + s.y + s.z;
}

__attribute__((noinline))
uint64_t caller(S s) {
    [[clang::musttail]]
    return callee(s);
}
```

The immediate issue is that `caller` decides to shuffle values around on
the stack, and in the process writes to `*rsp`, which contains the
return address. With the return address trashed, the `ret` in `callee`
jumps to an invalid address.

```asm
caller:
        mov     rax, qword ptr [rsp + 24]
        mov     qword ptr [rsp + 16], rax
        movaps  xmm0, xmmword ptr [rsp + 8]
        movups  xmmword ptr [rsp], xmm0 ; <-- overwrites the return address at [rsp]
        movaps  xmmword ptr [rsp + 8], xmm0
        mov     qword ptr [rsp + 24], rax
        jmp     callee
```

However, I think the actual problem is that the x86 backend never
considers `musttail` calls to be sibcalls. For sibcalls, no stack
reshuffling is required at all, circumventing the problem here.
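
With the fix, `caller` from the example above should lower to a plain sibcall
jump: the incoming stack argument is already at the offset the callee expects,
so nothing near `rsp` has to be rewritten. A rough sketch of the expected
codegen (exact output may differ):

```asm
caller:
        jmp     callee
```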

This PR essentially copies https://reviews.llvm.org/D131034 (cc
@huangjd), but this time I hope we can actually land it and solve this
problem.

The AArch64 backend also miscompiled this example, but it appears to have
been fixed in LLVM 20.

The fact that tail calls do not work for non-trivial argument types is a
blocker for tail call support in Rust; see
https://github.com/rust-lang/rust/issues/144855#issuecomment-3536643185.

(cherry picked from commit 782bf6aff6ba6e9617bd3c4e27b3b9220ed5c850)
---
 llvm/lib/Target/X86/X86ISelLowering.h         |  16 +
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp   | 192 +++++++++--
 llvm/test/CodeGen/X86/hipe-cc64.ll            |   3 +-
 llvm/test/CodeGen/X86/musttail-struct.ll      | 320 ++++++++++++++++++
 llvm/test/CodeGen/X86/musttail-tailcc.ll      |  18 -
 llvm/test/CodeGen/X86/sibcall.ll              |   9 +-
 ...c-store-ret-address-aliasing-stack-slot.ll |   6 +-
 llvm/test/CodeGen/X86/tailcallbyval64.ll      |   3 +-
 llvm/test/CodeGen/X86/tailccbyval64.ll        |   3 +-
 9 files changed, 504 insertions(+), 66 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/musttail-struct.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 7c8135d3a2013..a31ac8191ee40 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1072,6 +1072,19 @@ namespace llvm {
   //===--------------------------------------------------------------------===//
   //  X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
+    // Copying needed for an outgoing byval argument.
+    enum ByValCopyKind {
+      // Argument is already in the correct location, no copy needed.
+      NoCopy,
+      // Argument value is currently in the local stack frame, needs copying to
+      // outgoing argument area.
+      CopyOnce,
+      // Argument value is currently in the outgoing argument area, but not at
+      // the correct offset, so needs copying via a temporary in local stack
+      // space.
+      CopyViaTemp,
+    };
+
   public:
     explicit X86TargetLowering(const X86TargetMachine &TM,
                                const X86Subtarget &STI);
@@ -1777,6 +1790,9 @@ namespace llvm {
     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
+                                            SDValue Dst,
+                                            ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 80299a639d3a3..7e1c894655f3f 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2009,6 +2009,49 @@ SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
 }
 
+// Returns the type of copying which is required to set up a byval argument to
+// a tail-called function. This isn't needed for non-tail calls, because they
+// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
+// avoid clobbering another argument (CopyViaTemp), and sometimes can be
+// optimised to zero copies when forwarding an argument from the caller's
+// caller (NoCopy).
+X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
+    SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Globals are always safe to copy from.
+  if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
+    return CopyOnce;
+
+  // Can only analyse frame index nodes, conservatively assume we need a
+  // temporary.
+  auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
+  auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
+  if (!SrcFrameIdxNode || !DstFrameIdxNode)
+    return CopyViaTemp;
+
+  int SrcFI = SrcFrameIdxNode->getIndex();
+  int DstFI = DstFrameIdxNode->getIndex();
+  assert(MFI.isFixedObjectIndex(DstFI) &&
+         "byval passed in non-fixed stack slot");
+
+  int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
+  int64_t DstOffset = MFI.getObjectOffset(DstFI);
+
+  // If the source is in the local frame, then the copy to the argument
+  // memory is always valid.
+  bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
+  if (!FixedSrc || (FixedSrc && SrcOffset < 0))
+    return CopyOnce;
+
+  // If the value is already in the correct location, then no copying is
+  // needed. If not, then we need to copy via a temporary.
+  if (SrcOffset == DstOffset)
+    return NoCopy;
+  else
+    return CopyViaTemp;
+}
+
 SDValue
 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
@@ -2026,11 +2069,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   MachineFunction &MF = DAG.getMachineFunction();
   bool Is64Bit        = Subtarget.is64Bit();
-  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
-  bool IsSibcall      = false;
-  bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
-      CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
-  bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
+  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+  bool ShouldGuaranteeTCO = shouldGuaranteeTCO(
+      CallConv, MF.getTarget().Options.GuaranteedTailCallOpt);
+  bool IsCalleePopSRet =
+      !ShouldGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   bool HasNCSR = (CB && isa<CallInst>(CB) &&
                   CB->hasFnAttr("no_caller_saved_registers"));
@@ -2077,7 +2120,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
-  if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
+  if (Subtarget.isPICStyleGOT() && !ShouldGuaranteeTCO && !IsMustTail) {
     // If we are using a GOT, disable tail calls to external symbols with
     // default visibility. Tail calling such a symbol requires using a GOT
     // relocation, which forces early binding of the symbol. This breaks code
@@ -2089,15 +2132,20 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       isTailCall = false;
   }
 
-  if (isTailCall && !IsMustTail) {
-    // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
-                                                   IsCalleePopSRet);
-
-    // Sibcalls are automatically detected tailcalls which do not require
-    // ABI changes.
-    if (!IsGuaranteeTCO && isTailCall)
-      IsSibcall = true;
+  // Check if this tail call is a "sibling" call, which is loosely defined to
+  // be a tail call that doesn't require heroics like moving the return address
+  // or swapping byval arguments.
+  bool IsSibcall = false;
+  if (isTailCall) {
+    // We believe that this should be a tail call, now check if that is really
+    // possible.
+    IsSibcall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
+                                                  IsCalleePopSRet);
+
+    if (!IsMustTail) {
+      isTailCall = IsSibcall;
+      IsSibcall = IsSibcall && !ShouldGuaranteeTCO;
+    }
 
     if (isTailCall)
       ++NumTailCalls;
@@ -2116,13 +2164,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // This is a sibcall. The memory operands are available in caller's
     // own caller's stack.
     NumBytes = 0;
-  else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
+  else if (ShouldGuaranteeTCO && canGuaranteeTCO(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
+  // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
   int FPDiff = 0;
-  if (isTailCall &&
-      shouldGuaranteeTCO(CallConv,
-                         MF.getTarget().Options.GuaranteedTailCallOpt)) {
+  if (isTailCall && ShouldGuaranteeTCO && !IsSibcall) {
     // Lower arguments at fp - stackoffset + fpdiff.
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
 
@@ -2137,6 +2184,80 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   unsigned NumBytesToPush = NumBytes;
   unsigned NumBytesToPop = NumBytes;
 
+  SDValue StackPtr;
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // If we are doing a tail-call, any byval arguments will be written to stack
+  // space which was used for incoming arguments. If any of the values being used
+  // are incoming byval arguments to this function, then they might be
+  // overwritten by the stores of the outgoing arguments. To avoid this, we
+  // need to make a temporary copy of them in local stack space, then copy back
+  // to the argument area.
+  // FIXME: There's potential to improve the code by using virtual registers for
+  // temporary storage, and letting the register allocator spill if needed.
+  SmallVector<SDValue, 8> ByValTemporaries;
+  SDValue ByValTempChain;
+  if (isTailCall) {
+    // Use null SDValue to mean "no temporary recorded for this arg index".
+    ByValTemporaries.assign(OutVals.size(), SDValue());
+
+    SmallVector<SDValue, 8> ByValCopyChains;
+    for (const CCValAssign &VA : ArgLocs) {
+      unsigned ArgIdx = VA.getValNo();
+      SDValue Src = OutVals[ArgIdx];
+      ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
+
+      if (!Flags.isByVal())
+        continue;
+
+      auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+      if (!StackPtr.getNode())
+        StackPtr =
+            DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), PtrVT);
+
+      // Destination: where this byval should live in the callee’s frame
+      // after the tail call.
+      int64_t Offset = VA.getLocMemOffset() + FPDiff;
+      uint64_t Size = VA.getLocVT().getFixedSizeInBits() / 8;
+      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset,
+                                                   /*IsImmutable=*/true);
+      SDValue Dst = DAG.getFrameIndex(FI, PtrVT);
+
+      ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
+
+      if (Copy == NoCopy) {
+        // If the argument is already at the correct offset on the stack
+        // (because we are forwarding a byval argument from our caller), we
+        // don't need any copying.
+        continue;
+      } else if (Copy == CopyOnce) {
+        // If the argument is in our local stack frame, no other argument
+        // preparation can clobber it, so we can copy it to the final location
+        // later.
+        ByValTemporaries[ArgIdx] = Src;
+      } else {
+        assert(Copy == CopyViaTemp && "unexpected enum value");
+        // If we might be copying this argument from the outgoing argument
+        // stack area, we need to copy via a temporary in the local stack
+        // frame.
+        MachineFrameInfo &MFI = MF.getFrameInfo();
+        int TempFrameIdx = MFI.CreateStackObject(Flags.getByValSize(),
+                                                 Flags.getNonZeroByValAlign(),
+                                                 /*isSS=*/false);
+        SDValue Temp =
+            DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
+
+        SDValue CopyChain =
+            CreateCopyOfByValArgument(Src, Temp, Chain, Flags, DAG, dl);
+        ByValCopyChains.push_back(CopyChain);
+      }
+    }
+    if (!ByValCopyChains.empty())
+      ByValTempChain =
+          DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
+  }
+
   // If we have an inalloca argument, all stack space has already been allocated
   // for us and be right at the top of the stack.  We don't support multiple
   // arguments passed in memory when using inalloca.
@@ -2177,7 +2298,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
-  SDValue StackPtr;
 
   // The next loop assumes that the locations are in the same order of the
   // input arguments.
@@ -2186,7 +2306,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
        ++I, ++OutIndex) {
     assert(OutIndex < Outs.size() && "Invalid Out index");
@@ -2276,7 +2395,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         if (ShadowReg)
           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
       }
-    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+    } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) {
       assert(VA.isMemLoc());
       if (!StackPtr.getNode())
         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
@@ -2353,7 +2472,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   // don't need this because the eligibility check rejects calls that require
   // shuffling arguments passed in memory.
-  if (!IsSibcall && isTailCall) {
+  if (isTailCall && !IsSibcall) {
     // Force all the incoming stack arguments to be loaded from the stack
     // before any new outgoing arguments or the return address are stored to the
     // stack, because the outgoing stack slots may alias the incoming argument
@@ -2363,6 +2482,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // would clobber.
     Chain = DAG.getStackArgumentTokenFactor(Chain);
 
+    if (ByValTempChain)
+      Chain =
+          DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain);
+
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
@@ -2395,16 +2518,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
 
       if (Flags.isByVal()) {
-        // Copy relative to framepointer.
-        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
-        if (!StackPtr.getNode())
-          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
-                                        getPointerTy(DAG.getDataLayout()));
-        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
-                             StackPtr, Source);
-
-        MemOpChains2.push_back(
-            CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl));
+        if (SDValue ByValSrc = ByValTemporaries[OutsIndex]) {
+          auto PtrVT = getPointerTy(DAG.getDataLayout());
+          SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT);
+
+          MemOpChains2.push_back(CreateCopyOfByValArgument(
+              ByValSrc, DstAddr, Chain, Flags, DAG, dl));
+        }
       } else {
         // Store relative to framepointer.
         MemOpChains2.push_back(DAG.getStore(
@@ -2837,8 +2957,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
   bool CCMatch = CallerCC == CalleeCC;
   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
-  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
-      CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
+  bool ShouldGuaranteeTCO = shouldGuaranteeTCO(
+      CalleeCC, MF.getTarget().Options.GuaranteedTailCallOpt);
 
   // Win64 functions have extra shadow space for argument homing. Don't do the
   // sibcall if the caller and callee have mismatched expectations for this
@@ -2846,7 +2966,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
   if (IsCalleeWin64 != IsCallerWin64)
     return false;
 
-  if (IsGuaranteeTCO) {
+  if (ShouldGuaranteeTCO) {
     if (canGuaranteeTCO(CalleeCC) && CCMatch)
       return true;
     return false;
diff --git a/llvm/test/CodeGen/X86/hipe-cc64.ll b/llvm/test/CodeGen/X86/hipe-cc64.ll
index d8505641cd789..4cb033b1a6580 100644
--- a/llvm/test/CodeGen/X86/hipe-cc64.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc64.ll
@@ -21,14 +21,13 @@ define void @zap(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    movl $2, %ecx
 ; CHECK-NEXT:    movl $3, %r8d
 ; CHECK-NEXT:    movq %rax, %r9
-; CHECK-NEXT:    callq foo at PLT
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    jmp foo at PLT # TAILCALL
 entry:
   %0 = call cc 11 {i64, i64, i64} @addfour(i64 undef, i64 undef, i64 %a, i64 %b, i64 8, i64 9)
   %res = extractvalue {i64, i64, i64} %0, 2
diff --git a/llvm/test/CodeGen/X86/musttail-struct.ll b/llvm/test/CodeGen/X86/musttail-struct.ll
new file mode 100644
index 0000000000000..735fd674a2ff1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/musttail-struct.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -x86-asm-syntax=intel | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-asm-syntax=intel | FileCheck %s --check-prefix=X64
+
+; Test correct handling of a musttail call with a byval struct argument.
+
+%struct.1xi32 = type { [1 x i32] }
+%struct.3xi32 = type { [3 x i32] }
+%struct.5xi32 = type { [5 x i32] }
+
+declare dso_local i32 @Func1(ptr byval(%struct.1xi32) %0)
+declare dso_local i32 @Func3(ptr byval(%struct.3xi32) %0)
+declare dso_local i32 @Func5(ptr byval(%struct.5xi32) %0)
+declare dso_local i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+
+define dso_local i32 @test1(ptr byval(%struct.1xi32) %0) {
+; X32-LABEL: test1:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func1 # TAILCALL
+;
+; X64-LABEL: test1:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func1 # TAILCALL
+  %r = musttail call i32 @Func1(ptr byval(%struct.1xi32) %0)
+  ret i32 %r
+}
+
+define dso_local i32 @test3(ptr byval(%struct.3xi32) %0) {
+; X32-LABEL: test3:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func3 # TAILCALL
+;
+; X64-LABEL: test3:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func3 # TAILCALL
+  %r = musttail call i32 @Func3(ptr byval(%struct.3xi32) %0)
+  ret i32 %r
+}
+
+; sizeof(%struct.5xi32) > 16, in x64 this is passed on stack.
+define dso_local i32 @test5(ptr byval(%struct.5xi32) %0) {
+; X32-LABEL: test5:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp Func5 # TAILCALL
+;
+; X64-LABEL: test5:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp Func5 # TAILCALL
+  %r = musttail call i32 @Func5(ptr byval(%struct.5xi32) %0)
+  ret i32 %r
+}
+
+; Test passing multiple arguments with different sizes on stack. In x64 Linux
+; the first 6 are passed by register.
+define dso_local i32 @testManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) {
+; X32-LABEL: testManyArgs:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp FuncManyArgs # TAILCALL
+;
+; X64-LABEL: testManyArgs:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp FuncManyArgs # TAILCALL
+  %r = musttail call i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+  ret i32 %r
+}
+
+define dso_local i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) {
+; X32-LABEL: testRecursion:
+; X32:       # %bb.0:
+; X32-NEXT:    jmp testRecursion # TAILCALL
+;
+; X64-LABEL: testRecursion:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp testRecursion # TAILCALL
+  %r = musttail call i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7)
+  ret i32 %r
+}
+
+define dso_local i32 @swap(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) noinline {
+; X32-LABEL: swap:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    mov eax, dword ptr [esp + 4]
+; X32-NEXT:    add eax, dword ptr [esp + 8]
+; X32-NEXT:    ret
+;
+; X64-LABEL: swap:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov eax, dword ptr [rsp + 8]
+; X64-NEXT:    add eax, dword ptr [rsp + 16]
+; X64-NEXT:    ret
+entry:
+  %a.ptr = getelementptr inbounds %struct.1xi32, ptr %0, i32 0, i32 0, i32 0
+  %a     = load i32, ptr %a.ptr, align 4
+  %b.ptr = getelementptr inbounds %struct.1xi32, ptr %1, i32 0, i32 0, i32 0
+  %b     = load i32, ptr %b.ptr, align 4
+  %sum   = add i32 %a, %b
+  ret i32 %sum
+}
+
+define dso_local i32 @swapByValArguments(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) {
+; X32-LABEL: swapByValArguments:
+; X32:       # %bb.0:
+; X32-NEXT:    sub esp, 8
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    mov eax, dword ptr [esp + 12]
+; X32-NEXT:    mov dword ptr [esp], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 16]
+; X32-NEXT:    mov dword ptr [esp + 4], eax
+; X32-NEXT:    add esp, 8
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp swap # TAILCALL
+;
+; X64-LABEL: swapByValArguments:
+; X64:       # %bb.0:
+; X64-NEXT:    mov eax, dword ptr [rsp + 8]
+; X64-NEXT:    mov dword ptr [rsp - 16], eax
+; X64-NEXT:    mov eax, dword ptr [rsp + 16]
+; X64-NEXT:    mov dword ptr [rsp - 8], eax
+; X64-NEXT:    jmp swap # TAILCALL
+  %r = musttail call i32 @swap(ptr byval(%struct.1xi32) %1, ptr byval(%struct.1xi32) %0)
+  ret i32 %r
+}
+
+; Clang only uses byval for arguments of 65 bytes or larger, but e.g. rustc
+; does use byval for smaller types. Here we use a 20 byte struct to keep
+; the tests more readable.
+%twenty_bytes = type { [5 x i32] }
+declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+
+; Functions with byval parameters can be tail-called, because the value is
+; actually passed in registers and the stack in the same way for the caller and
+; callee. On x86 byval arguments are never (partially) passed via registers.
+define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    jmp large_callee at PLT # TAILCALL
+;
+; X64-LABEL: large_caller:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp large_callee at PLT # TAILCALL
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; The IR for this one looks dodgy, because it has an alloca passed to a
+; musttail function, but it is passed as a byval argument, so will be copied
+; into the stack space allocated by @large_caller_new_value's caller, so is
+; valid.
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller_new_value:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    sub esp, 20
+; X32-NEXT:    .cfi_def_cfa_offset 24
+; X32-NEXT:    mov dword ptr [esp], 0
+; X32-NEXT:    mov dword ptr [esp + 4], 1
+; X32-NEXT:    mov dword ptr [esp + 8], 2
+; X32-NEXT:    mov dword ptr [esp + 12], 3
+; X32-NEXT:    mov dword ptr [esp + 16], 4
+; X32-NEXT:    mov dword ptr [esp + 24], 0
+; X32-NEXT:    mov dword ptr [esp + 28], 1
+; X32-NEXT:    mov dword ptr [esp + 32], 2
+; X32-NEXT:    mov dword ptr [esp + 36], 3
+; X32-NEXT:    mov dword ptr [esp + 40], 4
+; X32-NEXT:    add esp, 20
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp large_callee at PLT # TAILCALL
+;
+; X64-LABEL: large_caller_new_value:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movabs rax, 4294967296
+; X64-NEXT:    mov qword ptr [rsp - 20], rax
+; X64-NEXT:    movabs rcx, 12884901890
+; X64-NEXT:    mov qword ptr [rsp - 12], rcx
+; X64-NEXT:    mov dword ptr [rsp - 4], 4
+; X64-NEXT:    mov qword ptr [rsp + 8], rax
+; X64-NEXT:    mov qword ptr [rsp + 16], rcx
+; X64-NEXT:    mov dword ptr [rsp + 24], 4
+; X64-NEXT:    jmp large_callee at PLT # TAILCALL
+entry:
+  %y = alloca %twenty_bytes, align 4
+  store i32 0, ptr %y, align 4
+  %0 = getelementptr inbounds i8, ptr %y, i32 4
+  store i32 1, ptr %0, align 4
+  %1 = getelementptr inbounds i8, ptr %y, i32 8
+  store i32 2, ptr %1, align 4
+  %2 = getelementptr inbounds i8, ptr %y, i32 12
+  store i32 3, ptr %2, align 4
+  %3 = getelementptr inbounds i8, ptr %y, i32 16
+  store i32 4, ptr %3, align 4
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
+  ret void
+}
+
+declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4)
+define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; X32-LABEL: swap_byvals:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    sub esp, 40
+; X32-NEXT:    .cfi_def_cfa_offset 44
+; X32-NEXT:    mov eax, dword ptr [esp + 60]
+; X32-NEXT:    mov dword ptr [esp + 16], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 56]
+; X32-NEXT:    mov dword ptr [esp + 12], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 52]
+; X32-NEXT:    mov dword ptr [esp + 8], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 44]
+; X32-NEXT:    mov ecx, dword ptr [esp + 48]
+; X32-NEXT:    mov dword ptr [esp + 4], ecx
+; X32-NEXT:    mov dword ptr [esp], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 80]
+; X32-NEXT:    mov dword ptr [esp + 36], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 76]
+; X32-NEXT:    mov dword ptr [esp + 32], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 72]
+; X32-NEXT:    mov dword ptr [esp + 28], eax
+; X32-NEXT:    mov eax, dword ptr [esp + 64]
+; X32-NEXT:    mov ecx, dword ptr [esp + 68]
+; X32-NEXT:    mov dword ptr [esp + 24], ecx
+; X32-NEXT:    mov dword ptr [esp + 20], eax
+; X32-NEXT:    add esp, 40
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    jmp two_byvals_callee at PLT # TAILCALL
+;
+; X64-LABEL: swap_byvals:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov eax, dword ptr [rsp + 24]
+; X64-NEXT:    mov dword ptr [rsp - 8], eax
+; X64-NEXT:    movaps xmm0, xmmword ptr [rsp + 8]
+; X64-NEXT:    movaps xmmword ptr [rsp - 24], xmm0
+; X64-NEXT:    mov eax, dword ptr [rsp + 48]
+; X64-NEXT:    mov dword ptr [rsp - 32], eax
+; X64-NEXT:    mov rax, qword ptr [rsp + 32]
+; X64-NEXT:    mov rcx, qword ptr [rsp + 40]
+; X64-NEXT:    mov qword ptr [rsp - 40], rcx
+; X64-NEXT:    mov qword ptr [rsp - 48], rax
+; X64-NEXT:    jmp two_byvals_callee at PLT # TAILCALL
+entry:
+  musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; A forwarded byval arg, but at a different argument position. Because
+; x86 does not (partially) pass byval arguments in registers, the byval
+; arg is in the correct position already, so this is not a sibcall but
+; can be tail-call optimized.
+declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; X32-LABEL: shift_byval:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    push edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    push esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset esi, -12
+; X32-NEXT:    .cfi_offset edi, -8
+; X32-NEXT:    mov eax, dword ptr [esp + 32]
+; X32-NEXT:    mov ecx, dword ptr [esp + 28]
+; X32-NEXT:    mov edx, dword ptr [esp + 24]
+; X32-NEXT:    mov esi, dword ptr [esp + 16]
+; X32-NEXT:    mov edi, dword ptr [esp + 20]
+; X32-NEXT:    push eax
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push ecx
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push edx
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push edi
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    push esi
+; X32-NEXT:    .cfi_adjust_cfa_offset 4
+; X32-NEXT:    call shift_byval_callee at PLT
+; X32-NEXT:    add esp, 20
+; X32-NEXT:    .cfi_adjust_cfa_offset -20
+; X32-NEXT:    pop esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pop edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    ret
+;
+; X64-LABEL: shift_byval:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp shift_byval_callee at PLT # TAILCALL
+entry:
+  tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b)
+  ret void
+}
+
+; A global object passed to a byval argument, so it must be copied, but doesn't
+; need a stack temporary.
+ at large_global = external global %twenty_bytes
+define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; X32-LABEL: large_caller_from_global:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    mov eax, dword ptr [large_global+16]
+; X32-NEXT:    mov dword ptr [esp + 20], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+12]
+; X32-NEXT:    mov dword ptr [esp + 16], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+8]
+; X32-NEXT:    mov dword ptr [esp + 12], eax
+; X32-NEXT:    mov eax, dword ptr [large_global+4]
+; X32-NEXT:    mov dword ptr [esp + 8], eax
+; X32-NEXT:    mov eax, dword ptr [large_global]
+; X32-NEXT:    mov dword ptr [esp + 4], eax
+; X32-NEXT:    jmp large_callee at PLT # TAILCALL
+;
+; X64-LABEL: large_caller_from_global:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    mov rax, qword ptr [rip + large_global at GOTPCREL]
+; X64-NEXT:    mov ecx, dword ptr [rax + 16]
+; X64-NEXT:    mov dword ptr [rsp + 24], ecx
+; X64-NEXT:    mov rcx, qword ptr [rax]
+; X64-NEXT:    mov rax, qword ptr [rax + 8]
+; X64-NEXT:    mov qword ptr [rsp + 16], rax
+; X64-NEXT:    mov qword ptr [rsp + 8], rcx
+; X64-NEXT:    jmp large_callee at PLT # TAILCALL
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/musttail-tailcc.ll b/llvm/test/CodeGen/X86/musttail-tailcc.ll
index fae698d53b927..f1ffbcb1142c5 100644
--- a/llvm/test/CodeGen/X86/musttail-tailcc.ll
+++ b/llvm/test/CodeGen/X86/musttail-tailcc.ll
@@ -55,15 +55,6 @@ define dso_local tailcc void @void_test(i32, i32, i32, i32) {
 ;
 ; X86-LABEL: void_test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp void_test # TAILCALL
   entry:
    musttail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
@@ -77,15 +68,6 @@ define dso_local tailcc i1 @i1test(i32, i32, i32, i32) {
 ;
 ; X86-LABEL: i1test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp i1test # TAILCALL
   entry:
   %4 = musttail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index 2759a9883975e..d1137cac7d365 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -295,10 +295,15 @@ declare dso_local i32 @foo5(i32, i32, i32, i32, i32)
 define dso_local i32 @t12(i32 %x, i32 %y, ptr byval(%struct.t) align 4 %z) nounwind ssp {
 ; X86-LABEL: t12:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    jne foo6 # TAILCALL
-; X86-NEXT:  # %bb.1: # %bb2
+; X86-NEXT:    je .LBB12_1
+; X86-NEXT:  # %bb.2: # %bb
+; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    jmp foo6 # TAILCALL
+; X86-NEXT:  .LBB12_1: # %bb2
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $20, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t12:
diff --git a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
index cd669768705e5..b901d22f66392 100644
--- a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
+++ b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc %s -o - | FileCheck %s
 
 target triple = "x86_64-apple-macosx"
@@ -24,9 +25,7 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4,
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; CHECK-NEXT:    callq _foo
 ; CHECK-NEXT:    movq %r14, (%rax)
-; CHECK-NEXT:    movl [[OFF:[0-9]+]](%rsp), %edx
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movq %rcx, [[OFF]](%rsp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    movq %rax, %r14
 ; CHECK-NEXT:    movq %r13, %rdi
 ; CHECK-NEXT:    movq %r15, %rsi
@@ -34,7 +33,6 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4,
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r15
-; CHECK-NEXT:    addq $16, %rsp
 ; CHECK-NEXT:    jmp _tc_fn ## TAILCALL
 entry:
   %res = tail call ptr @foo()
diff --git a/llvm/test/CodeGen/X86/tailcallbyval64.ll b/llvm/test/CodeGen/X86/tailcallbyval64.ll
index 3d2f6392bd150..e44e156a7ad4d 100644
--- a/llvm/test/CodeGen/X86/tailcallbyval64.ll
+++ b/llvm/test/CodeGen/X86/tailcallbyval64.ll
@@ -5,8 +5,7 @@
 ; Expect the entry point.
 ; CHECK-LABEL: tailcaller:
 
-; Expect 2 rep;movs because of tail call byval lowering.
-; CHECK: rep;
+; Expect 1 rep;movs because of tail call stack argument lowering.
 ; CHECK: rep;
 
 ; A sequence of copyto/copyfrom virtual registers is used to deal with byval
diff --git a/llvm/test/CodeGen/X86/tailccbyval64.ll b/llvm/test/CodeGen/X86/tailccbyval64.ll
index c08a9a77bfb88..6b440058c84b5 100644
--- a/llvm/test/CodeGen/X86/tailccbyval64.ll
+++ b/llvm/test/CodeGen/X86/tailccbyval64.ll
@@ -5,8 +5,7 @@
 ; Expect the entry point.
 ; CHECK-LABEL: tailcaller:
 
-; Expect 2 rep;movs because of tail call byval lowering.
-; CHECK: rep;
+; Expect 1 rep;movs because of tail call stack argument lowering.
 ; CHECK: rep;
 
 ; A sequence of copyto/copyfrom virtual registers is used to deal with byval


