[llvm] r296683 - Elide argument copies during instruction selection

Reid Kleckner via llvm-commits <llvm-commits@lists.llvm.org>
Wed Mar 1 13:42:01 PST 2017


Author: rnk
Date: Wed Mar  1 15:42:00 2017
New Revision: 296683

URL: http://llvm.org/viewvc/llvm-project?rev=296683&view=rev
Log:
Elide argument copies during instruction selection

Summary:
Avoids tons of prologue boilerplate when arguments are passed in memory
and left in memory. This can happen in a debug build or in a release
build when an argument alloca is escaped. This dramatically reduces the
code size of x86 debug builds, because X86 fast isel doesn't handle
arguments passed in memory at all; it only handles the x86_64 case of up
to 6 basic register parameters.
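
The pattern targeted is an argument copied into a local alloca whose
address escapes, as in the @simple function from the x86 test added
below (clang emits this shape for every named parameter at -O0):

  declare void @addrof_i32(i32*)

  define void @simple(i32 %x) {
  entry:
    %x.addr = alloca i32
    store i32 %x, i32* %x.addr          ; the copy this patch elides
    call void @addrof_i32(i32* %x.addr) ; the alloca escapes here
    ret void
  }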

This is implemented by analyzing the entry block before ISel to identify
copy elision candidates. A copy elision candidate is an argument that is
used to fully initialize an alloca before any other possibly escaping
uses of that alloca. If an argument is a copy elision candidate, we set
a flag on the InputArg. If the target generates loads from a fixed
stack object that matches the size and alignment requirements of the
alloca, the SelectionDAG builder will delete the stack object created
for the alloca and replace it with the fixed stack object. The load is
left behind to satisfy any remaining uses of the argument value. The
store is now dead and is therefore elided. The fixed stack object is
also marked as mutable, as it may now be modified by the user, and it
would be invalid to rematerialize the initial load from it.
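
The test_mm_setcsr change at the bottom of this patch gives a concrete
picture of what this buys. Roughly (a sketch based on that test's CHECK
lines, writing the wildcard load offset out as 8(%esp)):

  # before: copy the argument into a fresh stack slot to take its address
          pushl %eax
          movl 8(%esp), %eax
          movl %esp, %ecx
          movl %eax, (%esp)
          ldmxcsr (%ecx)
          popl %eax
          retl

  # after: take the address of the incoming argument slot directly
          leal 4(%esp), %eax
          ldmxcsr (%eax)
          retl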

Supersedes D28388

Fixes PR26328

Reviewers: chandlerc, MatzeB, qcolombet, inglorion, hans

Subscribers: igorb, llvm-commits

Differential Revision: https://reviews.llvm.org/D29668

Added:
    llvm/trunk/test/CodeGen/ARM/arg-copy-elide.ll
    llvm/trunk/test/CodeGen/X86/arg-copy-elide.ll
Modified:
    llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
    llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h
    llvm/trunk/include/llvm/Target/TargetCallingConv.h
    llvm/trunk/lib/CodeGen/AsmPrinter/DwarfDebug.h
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/AArch64/arm64-abi-varargs.ll
    llvm/trunk/test/CodeGen/Mips/o32_cc_vararg.ll
    llvm/trunk/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
    llvm/trunk/test/CodeGen/X86/inline-asm-tied.ll
    llvm/trunk/test/CodeGen/X86/pr30430.ll
    llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
    llvm/trunk/test/DebugInfo/X86/discriminator.ll

Modified: llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h (original)
+++ llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h Wed Mar  1 15:42:00 2017
@@ -559,8 +559,7 @@ public:
     return Objects[ObjectIdx+NumFixedObjects].isAliased;
   }
 
-  /// isImmutableObjectIndex - Returns true if the specified index corresponds
-  /// to an immutable object.
+  /// Returns true if the specified index corresponds to an immutable object.
   bool isImmutableObjectIndex(int ObjectIdx) const {
     // Tail calling functions can clobber their function arguments.
     if (HasTailCall)
@@ -570,6 +569,13 @@ public:
     return Objects[ObjectIdx+NumFixedObjects].isImmutable;
   }
 
+  /// Marks the immutability of an object.
+  void setIsImmutableObjectIndex(int ObjectIdx, bool Immutable) {
+    assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    Objects[ObjectIdx+NumFixedObjects].isImmutable = Immutable;
+  }
+
   /// Returns true if the specified index corresponds to a spill slot.
   bool isSpillSlotObjectIndex(int ObjectIdx) const {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&

Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h (original)
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h Wed Mar  1 15:42:00 2017
@@ -54,6 +54,7 @@ public:
   const TargetInstrInfo *TII;
   const TargetLowering *TLI;
   bool FastISelFailed;
+  SmallPtrSet<const Instruction *, 4> ElidedArgCopyInstrs;
 
   static char ID;
 

Modified: llvm/trunk/include/llvm/Target/TargetCallingConv.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetCallingConv.h?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetCallingConv.h (original)
+++ llvm/trunk/include/llvm/Target/TargetCallingConv.h Wed Mar  1 15:42:00 2017
@@ -45,6 +45,7 @@ namespace ISD {
     unsigned OrigAlign : 5;    ///< Log 2 of original alignment
     unsigned IsInConsecutiveRegsLast : 1;
     unsigned IsInConsecutiveRegs : 1;
+    unsigned IsCopyElisionCandidate : 1; ///< Argument copy elision candidate
 
     unsigned ByValSize; ///< Byval struct size
 
@@ -54,7 +55,8 @@ namespace ISD {
           IsReturned(0), IsSplit(0), IsInAlloca(0), IsSplitEnd(0),
           IsSwiftSelf(0), IsSwiftError(0), IsHva(0), IsHvaStart(0),
           IsSecArgPass(0), ByValAlign(0), OrigAlign(0),
-          IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), ByValSize(0) {
+          IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0),
+          IsCopyElisionCandidate(0), ByValSize(0) {
       static_assert(sizeof(*this) == 2 * sizeof(unsigned), "flags are too big");
     }
 
@@ -109,6 +111,9 @@ namespace ISD {
     bool isSplitEnd()   const { return IsSplitEnd; }
     void setSplitEnd()  { IsSplitEnd = 1; }
 
+    bool isCopyElisionCandidate()  const { return IsCopyElisionCandidate; }
+    void setCopyElisionCandidate() { IsCopyElisionCandidate = 1; }
+
     unsigned getByValAlign() const { return (1U << ByValAlign) / 2; }
     void setByValAlign(unsigned A) {
       ByValAlign = Log2_32(A) + 1;

Modified: llvm/trunk/lib/CodeGen/AsmPrinter/DwarfDebug.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/AsmPrinter/DwarfDebug.h?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/AsmPrinter/DwarfDebug.h (original)
+++ llvm/trunk/lib/CodeGen/AsmPrinter/DwarfDebug.h Wed Mar  1 15:42:00 2017
@@ -89,7 +89,7 @@ public:
     assert(!MInsn && "Already initialized?");
 
     assert((!E || E->isValid()) && "Expected valid expression");
-    assert(~FI && "Expected valid index");
+    assert(FI != INT_MAX && "Expected valid index");
 
     FrameIndexExprs.push_back({FI, E});
   }

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Wed Mar  1 15:42:00 2017
@@ -8028,6 +8028,173 @@ static bool isOnlyUsedInEntryBlock(const
   return true;
 }
 
+typedef DenseMap<const Argument *,
+                 std::pair<const AllocaInst *, const StoreInst *>>
+    ArgCopyElisionMapTy;
+
+/// Scan the entry block of the function in FuncInfo for arguments that look
+/// like copies into a local alloca. Record any copied arguments in
+/// ArgCopyElisionCandidates.
+static void
+findArgumentCopyElisionCandidates(const DataLayout &DL,
+                                  FunctionLoweringInfo *FuncInfo,
+                                  ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
+  // Record the state of every static alloca used in the entry block. Argument
+  // allocas are all used in the entry block, so we need approximately as many
+  // entries as we have arguments.
+  enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
+  SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
+  unsigned NumArgs = FuncInfo->Fn->getArgumentList().size();
+  StaticAllocas.reserve(NumArgs * 2);
+
+  auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
+    if (!V)
+      return nullptr;
+    V = V->stripPointerCasts();
+    const auto *AI = dyn_cast<AllocaInst>(V);
+    if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
+      return nullptr;
+    auto Iter = StaticAllocas.insert({AI, Unknown});
+    return &Iter.first->second;
+  };
+
+  // Look for stores of arguments to static allocas. Look through bitcasts and
+  // GEPs to handle type coercions, as long as the alloca is fully initialized
+  // by the store. Any non-store use of an alloca escapes it and any subsequent
+  // unanalyzed store might write it.
+  // FIXME: Handle structs initialized with multiple stores.
+  for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
+    // Look for stores, and handle non-store uses conservatively.
+    const auto *SI = dyn_cast<StoreInst>(&I);
+    if (!SI) {
+      // We will look through cast uses, so ignore them completely.
+      if (I.isCast())
+        continue;
+      // Ignore debug info intrinsics, they don't escape or store to allocas.
+      if (isa<DbgInfoIntrinsic>(I))
+        continue;
+      // This is an unknown instruction. Assume it escapes or writes to all
+      // static alloca operands.
+      for (const Use &U : I.operands()) {
+        if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
+          *Info = StaticAllocaInfo::Clobbered;
+      }
+      continue;
+    }
+
+    // If the stored value is a static alloca, mark it as escaped.
+    if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
+      *Info = StaticAllocaInfo::Clobbered;
+
+    // Check if the destination is a static alloca.
+    const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
+    StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
+    if (!Info)
+      continue;
+    const AllocaInst *AI = cast<AllocaInst>(Dst);
+
+    // Skip allocas that have been initialized or clobbered.
+    if (*Info != StaticAllocaInfo::Unknown)
+      continue;
+
+    // Check if the stored value is an argument, and that this store fully
+    // initializes the alloca. Don't elide copies from the same argument twice.
+    const Value *Val = SI->getValueOperand()->stripPointerCasts();
+    const auto *Arg = dyn_cast<Argument>(Val);
+    if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
+        Arg->getType()->isEmptyTy() ||
+        DL.getTypeStoreSize(Arg->getType()) !=
+            DL.getTypeAllocSize(AI->getAllocatedType()) ||
+        ArgCopyElisionCandidates.count(Arg)) {
+      *Info = StaticAllocaInfo::Clobbered;
+      continue;
+    }
+
+    DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');
+
+    // Mark this alloca and store for argument copy elision.
+    *Info = StaticAllocaInfo::Elidable;
+    ArgCopyElisionCandidates.insert({Arg, {AI, SI}});
+
+    // Stop scanning if we've seen all arguments. This will happen early in -O0
+    // builds, which is useful, because -O0 builds have large entry blocks and
+    // many allocas.
+    if (ArgCopyElisionCandidates.size() == NumArgs)
+      break;
+  }
+}
+
+/// Try to elide argument copies from memory into a local alloca. Succeeds if
+/// ArgVal is a load from a suitable fixed stack object.
+static void tryToElideArgumentCopy(
+    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
+    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
+    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
+    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
+    SDValue ArgVal, bool &ArgHasUses) {
+  // Check if this is a load from a fixed stack object.
+  auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
+  if (!LNode)
+    return;
+  auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
+  if (!FINode)
+    return;
+
+  // Check that the fixed stack object is the right size and alignment.
+  // Look at the alignment that the user wrote on the alloca instead of looking
+  // at the stack object.
+  auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
+  assert(ArgCopyIter != ArgCopyElisionCandidates.end());
+  const AllocaInst *AI = ArgCopyIter->second.first;
+  int FixedIndex = FINode->getIndex();
+  int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
+  int OldIndex = AllocaIndex;
+  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
+  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
+    DEBUG(dbgs() << "  argument copy elision failed due to bad fixed stack "
+                    "object size\n");
+    return;
+  }
+  unsigned RequiredAlignment = AI->getAlignment();
+  if (!RequiredAlignment) {
+    RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
+        AI->getAllocatedType());
+  }
+  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
+    DEBUG(dbgs() << "  argument copy elision failed: alignment of alloca "
+                    "greater than stack argument alignment ("
+                 << RequiredAlignment << " vs "
+                 << MFI.getObjectAlignment(FixedIndex) << ")\n");
+    return;
+  }
+
+  // Perform the elision. Delete the old stack object and replace its only use
+  // in the variable info map. Mark the stack object as mutable.
+  DEBUG({
+    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
+           << "  Replacing frame index " << OldIndex << " with " << FixedIndex
+           << '\n';
+  });
+  MFI.RemoveStackObject(OldIndex);
+  MFI.setIsImmutableObjectIndex(FixedIndex, false);
+  AllocaIndex = FixedIndex;
+  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
+  Chains.push_back(ArgVal.getValue(1));
+
+  // Avoid emitting code for the store implementing the copy.
+  const StoreInst *SI = ArgCopyIter->second.second;
+  ElidedArgCopyInstrs.insert(SI);
+
+  // Check for uses of the argument again so that we can avoid exporting ArgVal
+  // if it isn't used by anything other than the store.
+  for (const Value *U : Arg.users()) {
+    if (U != SI) {
+      ArgHasUses = true;
+      break;
+    }
+  }
+}
+
 void SelectionDAGISel::LowerArguments(const Function &F) {
   SelectionDAG &DAG = SDB->DAG;
   SDLoc dl = SDB->getCurSDLoc();
@@ -8050,6 +8217,12 @@ void SelectionDAGISel::LowerArguments(co
     Ins.push_back(RetArg);
   }
 
+  // Look for stores of arguments to static allocas. Mark such arguments with a
+  // flag to ask the target to give us the memory location of that argument if
+  // available.
+  ArgCopyElisionMapTy ArgCopyElisionCandidates;
+  findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
+
   // Set up the incoming argument description vector.
   unsigned Idx = 0;
   for (const Argument &Arg : F.args()) {
@@ -8127,6 +8300,8 @@ void SelectionDAGISel::LowerArguments(co
       if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
       Flags.setOrigAlign(OriginalAlignment);
+      if (ArgCopyElisionCandidates.count(&Arg))
+        Flags.setCopyElisionCandidate();
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
       unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
@@ -8199,19 +8374,33 @@ void SelectionDAGISel::LowerArguments(co
     ++i;
   }
 
+  SmallVector<SDValue, 4> Chains;
+  DenseMap<int, int> ArgCopyElisionFrameIndexMap;
   for (const Argument &Arg : F.args()) {
     ++Idx;
     SmallVector<SDValue, 4> ArgValues;
     SmallVector<EVT, 4> ValueVTs;
     ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
     unsigned NumValues = ValueVTs.size();
+    if (NumValues == 0)
+      continue;
+
+    bool ArgHasUses = !Arg.use_empty();
+
+    // Elide the copying store if the target loaded this argument from a
+    // suitable fixed stack object.
+    if (Ins[i].Flags.isCopyElisionCandidate()) {
+      tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
+                             ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
+                             InVals[i], ArgHasUses);
+    }
 
     // If this argument is unused then remember its value. It is used to generate
     // debugging information.
     bool isSwiftErrorArg =
         TLI->supportSwiftError() &&
         F.getAttributes().hasAttribute(Idx, Attribute::SwiftError);
-    if (Arg.use_empty() && NumValues && !isSwiftErrorArg) {
+    if (!ArgHasUses && !isSwiftErrorArg) {
       SDB->setUnusedArgValue(&Arg, InVals[i]);
 
       // Also remember any frame index for use in FastISel.
@@ -8228,16 +8417,15 @@ void SelectionDAGISel::LowerArguments(co
       // Even an apparent 'unused' swifterror argument needs to be returned. So
       // we do generate a copy for it that can be used on return from the
       // function.
-      if (!Arg.use_empty() || isSwiftErrorArg) {
+      if (ArgHasUses || isSwiftErrorArg) {
         Optional<ISD::NodeType> AssertOp;
         if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
           AssertOp = ISD::AssertSext;
         else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
           AssertOp = ISD::AssertZext;
 
-        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
-                                             NumParts, PartVT, VT,
-                                             nullptr, AssertOp));
+        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
+                                             PartVT, VT, nullptr, AssertOp));
       }
 
       i += NumParts;
@@ -8291,8 +8479,26 @@ void SelectionDAGISel::LowerArguments(co
     }
   }
 
+  if (!Chains.empty()) {
+    Chains.push_back(NewRoot);
+    NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  }
+
+  DAG.setRoot(NewRoot);
+
   assert(i == InVals.size() && "Argument register count mismatch!");
 
+  // If any argument copy elisions occurred and we have debug info, update the
+  // stale frame indices used in the dbg.declare variable info table.
+  MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo();
+  if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
+    for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
+      auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
+      if (I != ArgCopyElisionFrameIndexMap.end())
+        VI.Slot = I->second;
+    }
+  }
+
   // Finally, if the target has anything special to do, allow it to do so.
   EmitFunctionEntryCode();
 }

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp Wed Mar  1 15:42:00 2017
@@ -713,8 +713,10 @@ void SelectionDAGISel::SelectBasicBlock(
                                         bool &HadTailCall) {
   // Lower the instructions. If a call is emitted as a tail call, cease emitting
   // nodes for this block.
-  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
-    SDB->visit(*I);
+  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
+    if (!ElidedArgCopyInstrs.count(&*I))
+      SDB->visit(*I);
+  }
 
   // Make sure the root of the DAG is up-to-date.
   CurDAG->setRoot(SDB->getControlRoot());
@@ -1564,7 +1566,8 @@ void SelectionDAGISel::SelectAllBasicBlo
         const Instruction *Inst = &*std::prev(BI);
 
         // If we no longer require this instruction, skip it.
-        if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
+        if (isFoldedOrDeadInstruction(Inst, FuncInfo) ||
+            ElidedArgCopyInstrs.count(Inst)) {
           --NumFastIselRemaining;
           continue;
         }
@@ -1694,6 +1697,7 @@ void SelectionDAGISel::SelectAllBasicBlo
 
     FinishBasicBlock();
     FuncInfo->PHINodesToUpdate.clear();
+    ElidedArgCopyInstrs.clear();
   }
 
   propagateSwiftErrorVRegs(FuncInfo);

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Mar  1 15:42:00 2017
@@ -2691,6 +2691,7 @@ X86TargetLowering::LowerMemArgument(SDVa
       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   EVT ValVT;
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
 
   // If value is passed by pointer we have address passed instead of the value
   // itself. No need to extend if the mask value and location share the same
@@ -2729,30 +2730,71 @@ X86TargetLowering::LowerMemArgument(SDVa
     if (CallConv == CallingConv::X86_INTR) {
       MFI.setObjectOffset(FI, Offset);
     }
-    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
-  } else {
-    int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
-                                   VA.getLocMemOffset(), isImmutable);
-
-    // Set SExt or ZExt flag.
-    if (VA.getLocInfo() == CCValAssign::ZExt) {
-      MFI.setObjectZExt(FI, true);
-    } else if (VA.getLocInfo() == CCValAssign::SExt) {
-      MFI.setObjectSExt(FI, true);
-    }
+    return DAG.getFrameIndex(FI, PtrVT);
+  }
 
-    // Adjust SP offset of interrupt parameter.
-    if (CallConv == CallingConv::X86_INTR) {
-      MFI.setObjectOffset(FI, Offset);
+  // This is an argument in memory. We might be able to perform copy elision.
+  if (Flags.isCopyElisionCandidate()) {
+    EVT ArgVT = Ins[i].ArgVT;
+    SDValue PartAddr;
+    if (Ins[i].PartOffset == 0) {
+      // If this is a one-part value or the first part of a multi-part value,
+      // create a stack object for the entire argument value type and return a
+      // load from our portion of it. This assumes that if the first part of an
+      // argument is in memory, the rest will also be in memory.
+      int FI = MFI.CreateFixedObject(ArgVT.getSizeInBits() / 8,
+                                     VA.getLocMemOffset(), /*Immutable=*/false);
+      PartAddr = DAG.getFrameIndex(FI, PtrVT);
+      return DAG.getLoad(
+          ValVT, dl, Chain, PartAddr,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+    } else {
+      // This is not the first piece of an argument in memory. See if there is
+      // already a fixed stack object including this offset. If so, assume it
+      // was created by the PartOffset == 0 branch above and create a load from
+      // the appropriate offset into it.
+      int64_t PartBegin = VA.getLocMemOffset();
+      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+      int FI = MFI.getObjectIndexBegin();
+      for (; MFI.isFixedObjectIndex(FI); ++FI) {
+        int64_t ObjBegin = MFI.getObjectOffset(FI);
+        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+          break;
+      }
+      if (MFI.isFixedObjectIndex(FI)) {
+        SDValue Addr =
+            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+        return DAG.getLoad(
+            ValVT, dl, Chain, Addr,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+                                              Ins[i].PartOffset));
+      }
     }
+  }
 
-    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
-    SDValue Val = DAG.getLoad(
-        ValVT, dl, Chain, FIN,
-        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
-    return ExtendedInMem ?
-      DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+                                 VA.getLocMemOffset(), isImmutable);
+
+  // Set SExt or ZExt flag.
+  if (VA.getLocInfo() == CCValAssign::ZExt) {
+    MFI.setObjectZExt(FI, true);
+  } else if (VA.getLocInfo() == CCValAssign::SExt) {
+    MFI.setObjectSExt(FI, true);
   }
+
+  // Adjust SP offset of interrupt parameter.
+  if (CallConv == CallingConv::X86_INTR) {
+    MFI.setObjectOffset(FI, Offset);
+  }
+
+  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+  SDValue Val = DAG.getLoad(
+      ValVT, dl, Chain, FIN,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+  return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
+                       : Val;
 }
 
 // FIXME: Get this from tablegen.

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-abi-varargs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-abi-varargs.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-abi-varargs.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-abi-varargs.ll Wed Mar  1 15:42:00 2017
@@ -3,7 +3,7 @@
 ; rdar://13625505
 ; Here we have 9 fixed integer arguments; the 9th argument is on the stack, and
 ; the varargs start right after it at 8-byte alignment.
-define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
 ; CHECK-LABEL: fn9:
 ; 9th fixed argument
 ; CHECK: ldr {{w[0-9]+}}, [sp, #64]
@@ -30,7 +30,6 @@ define void @fn9(i32 %a1, i32 %a2, i32 %
   %a10 = alloca i32, align 4
   %a11 = alloca i32, align 4
   %a12 = alloca i32, align 4
-  store i32 %a1, i32* %1, align 4
   store i32 %a2, i32* %2, align 4
   store i32 %a3, i32* %3, align 4
   store i32 %a4, i32* %4, align 4
@@ -39,6 +38,7 @@ define void @fn9(i32 %a1, i32 %a2, i32 %
   store i32 %a7, i32* %7, align 4
   store i32 %a8, i32* %8, align 4
   store i32 %a9, i32* %9, align 4
+  store i32 %a9, i32* %a1
   %10 = bitcast i8** %args to i8*
   call void @llvm.va_start(i8* %10)
   %11 = va_arg i8** %args, i32
@@ -93,7 +93,7 @@ define i32 @main() nounwind ssp {
   %10 = load i32, i32* %a10, align 4
   %11 = load i32, i32* %a11, align 4
   %12 = load i32, i32* %a12, align 4
-  call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+  call void (i32*, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32* %a1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
   ret i32 0
 }
 

Added: llvm/trunk/test/CodeGen/ARM/arg-copy-elide.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/arg-copy-elide.ll?rev=296683&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/arg-copy-elide.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/arg-copy-elide.ll Wed Mar  1 15:42:00 2017
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=armv7-linux < %s | FileCheck %s
+
+declare arm_aapcscc void @addrof_i32(i32*)
+declare arm_aapcscc void @addrof_i64(i64*)
+
+define arm_aapcscc void @simple(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: simple:
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: pop {r11, pc}
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define arm_aapcscc i32 @use_arg(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret i32 %x
+}
+
+; CHECK-LABEL: use_arg:
+; CHECK: push {[[csr:[^ ]*]], lr}
+; CHECK: ldr [[csr]], [sp, #8]
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: mov r0, [[csr]]
+; CHECK: pop {[[csr]], pc}
+
+
+define arm_aapcscc i64 @split_i64(i32, i32, i32, i32, i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: split_i64:
+; CHECK: push    {r4, r5, r11, lr}
+; CHECK: sub     sp, sp, #8
+; CHECK: ldr     r4, [sp, #28]
+; CHECK: ldr     r5, [sp, #24]
+; CHECK: mov     r0, sp
+; CHECK: str     r4, [sp, #4]
+; CHECK: str     r5, [sp]
+; CHECK: bl      addrof_i64
+; CHECK: mov     r0, r5
+; CHECK: mov     r1, r4
+; CHECK: add     sp, sp, #8
+; CHECK: pop     {r4, r5, r11, pc}

Modified: llvm/trunk/test/CodeGen/Mips/o32_cc_vararg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Mips/o32_cc_vararg.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Mips/o32_cc_vararg.ll (original)
+++ llvm/trunk/test/CodeGen/Mips/o32_cc_vararg.ll Wed Mar  1 15:42:00 2017
@@ -236,8 +236,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK-LABEL: va9:
-; CHECK: addiu   $sp, $sp, -32
-; CHECK: lw      $2, 52($sp)
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: lw      $2, 44($sp)
 }
 
 ; double

Modified: llvm/trunk/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll Wed Mar  1 15:42:00 2017
@@ -8,9 +8,10 @@ target triple = "i386-apple-darwin10.0.0
 @.str = internal constant [4 x i8] c"%p\0A\00"    ; <[4 x i8]*> [#uses=1]
 @llvm.used = appending global [1 x i8*] [i8* bitcast (i8* (%struct.S*, i32, %struct.S*)* @_Z4test1SiS_ to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
 
-; Verify that %esi gets spilled before the call.
+; Verify that %s1 gets spilled before the call.
 ; CHECK: Z4test1SiS
-; CHECK: movl %esi,{{.*}}(%ebp) 
+; CHECK: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]],{{.*}}(%ebp) ## 4-byte Spill
 ; CHECK: calll __Z6throwsv
 
 define i8* @_Z4test1SiS_(%struct.S* byval %s1, i32 %n, %struct.S* byval %s2) ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {

Added: llvm/trunk/test/CodeGen/X86/arg-copy-elide.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/arg-copy-elide.ll?rev=296683&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/arg-copy-elide.ll (added)
+++ llvm/trunk/test/CodeGen/X86/arg-copy-elide.ll Wed Mar  1 15:42:00 2017
@@ -0,0 +1,280 @@
+; RUN: llc -mtriple=i686-windows < %s | FileCheck %s
+
+declare void @addrof_i32(i32*)
+declare void @addrof_i64(i64*)
+declare void @addrof_i128(i128*)
+declare void @addrof_i32_x3(i32*, i32*, i32*)
+
+define void @simple(i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _simple:
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define i32 @use_arg(i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret i32 %x
+}
+
+; CHECK-LABEL: _use_arg:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl 8(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define i64 @split_i64(i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: _split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %[[csr2:[^ ]*]]
+; CHECK: pushl %[[csr1:[^ ]*]]
+; CHECK: andl $-8, %esp
+; CHECK-DAG: movl 8(%ebp), %[[csr1]]
+; CHECK-DAG: movl 12(%ebp), %[[csr2]]
+; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK-DAG: movl %[[csr1]], %eax
+; CHECK-DAG: movl %[[csr2]], %edx
+; CHECK: leal -8(%ebp), %esp
+; CHECK: popl %[[csr1]]
+; CHECK: popl %[[csr2]]
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when an i64 is split between registers and memory in a
+; fastcc function.
+
+define fastcc i64 @fastcc_split_i64(i64* %p, i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: _fastcc_split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
+; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl %[[r2]], 4(%esp)
+; CHECK-DAG: movl %[[r1]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the user requested alignment.
+
+define void @high_alignment(i32 %x) {
+entry:
+  %x.p = alloca i32, align 128
+  store i32 %x, i32* %x.p
+  call void @addrof_i32(i32* %x.p)
+  ret void
+}
+
+; CHECK-LABEL: _high_alignment:
+; CHECK: andl $-128, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the ABI required alignment.
+; FIXME: We should lower the ABI alignment of i64 on Windows, since MSVC
+; doesn't guarantee it.
+
+define void @abi_alignment(i64 %x) {
+entry:
+  %x.p = alloca i64
+  store i64 %x, i64* %x.p
+  call void @addrof_i64(i64* %x.p)
+  ret void
+}
+
+; CHECK-LABEL: _abi_alignment:
+; CHECK: andl $-8, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: retl
+
+
+; The code we generate for this is unimportant. This is mostly a crash test.
+
+define void @split_i128(i128* %sret, i128 %x) {
+entry:
+  %x.addr = alloca i128
+  store i128 %x, i128* %x.addr
+  call void @addrof_i128(i128* %x.addr)
+  store i128 %x, i128* %sret
+  ret void
+}
+
+; CHECK-LABEL: _split_i128:
+; CHECK: pushl %ebp
+; CHECK: calll _addrof_i128
+; CHECK: retl
+
+
+; Check that we load all of x, y, and z before the call.
+
+define i32 @three_args(i32 %x, i32 %y, i32 %z) {
+entry:
+  %z.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  store i32 %z, i32* %z.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  store i32 %x, i32* %x.addr, align 4
+  call void @addrof_i32_x3(i32* %x.addr, i32* %y.addr, i32* %z.addr)
+  %s1 = add i32 %x, %y
+  %sum = add i32 %s1, %z
+  ret i32 %sum
+}
+
+; CHECK-LABEL: _three_args:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[x:[^ ]*]]
+; CHECK-DAG: leal 12(%esp), %[[y:[^ ]*]]
+; CHECK-DAG: leal 16(%esp), %[[z:[^ ]*]]
+; CHECK: pushl %[[z]]
+; CHECK: pushl %[[y]]
+; CHECK: pushl %[[x]]
+; CHECK: calll _addrof_i32_x3
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define void @two_args_same_alloca(i32 %x, i32 %y) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  store i32 %y, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _two_args_same_alloca:
+; CHECK: movl 8(%esp), {{.*}}
+; CHECK: movl {{.*}}, 4(%esp)
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_byval(i32* byval %x) {
+entry:
+  %x.p.p = alloca i32*
+  store i32* %x, i32** %x.p.p
+  call void @addrof_i32(i32* %x)
+  ret void
+}
+
+; CHECK-LABEL: _avoid_byval:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_inalloca(i32* inalloca %x) {
+entry:
+  %x.p.p = alloca i32*
+  store i32* %x, i32** %x.p.p
+  call void @addrof_i32(i32* %x)
+  ret void
+}
+
+; CHECK-LABEL: _avoid_inalloca:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; Don't elide the copy when the alloca is escaped with a store.
+
+define void @escape_with_store(i32 %x) {
+  %x1 = alloca i32
+  %x2 = alloca i32*
+  store i32* %x1, i32** %x2
+  %x3 = load i32*, i32** %x2
+  store i32 0, i32* %x3
+  store i32 %x, i32* %x1
+  call void @addrof_i32(i32* %x1)
+  ret void
+}
+
+; CHECK-LABEL: _escape_with_store:
+; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
+; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: calll _addrof_i32
+
+
+; This test case exposed issues with the use of TokenFactor.
+
+define void @sret_and_elide(i32* sret %sret, i32 %v) {
+  %v.p = alloca i32
+  store i32 %v, i32* %v.p
+  call void @addrof_i32(i32* %v.p)
+  store i32 %v, i32* %sret
+  ret void
+}
+
+; CHECK-LABEL: _sret_and_elide:
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: movl 12(%esp), %[[sret:[^ ]*]]
+; CHECK: movl 16(%esp), %[[v:[^ ]*]]
+; CHECK: leal 16(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[v]], (%[[sret]])
+; CHECK: movl %[[sret]], %eax
+; CHECK: popl
+; CHECK: popl
+; CHECK: retl

Modified: llvm/trunk/test/CodeGen/X86/inline-asm-tied.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/inline-asm-tied.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/inline-asm-tied.ll (original)
+++ llvm/trunk/test/CodeGen/X86/inline-asm-tied.ll Wed Mar  1 15:42:00 2017
@@ -1,31 +1,27 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
 ; rdar://6992609
 
-; CHECK: movl %ecx, 4([[ESP:%e..]])
-; CHECK: movl 4([[ESP]]), [[EDX:%e..]]
-; CHECK: movl [[EDX]], 4([[ESP]])
 target triple = "i386-apple-darwin9.0"
-@llvm.used = appending global [1 x i8*] [i8* bitcast (i64 (i64)* @_OSSwapInt64 to i8*)], section "llvm.metadata"		; <[1 x i8*]*> [#uses=0]
 
 define i64 @_OSSwapInt64(i64 %_data) nounwind {
 entry:
-	%retval = alloca i64		; <i64*> [#uses=2]
-	%_data.addr = alloca i64		; <i64*> [#uses=4]
-	store i64 %_data, i64* %_data.addr
-	%tmp = load i64, i64* %_data.addr		; <i64> [#uses=1]
-	%0 = call i64 asm "bswap   %eax\0A\09bswap   %edx\0A\09xchgl   %eax, %edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %tmp) nounwind		; <i64> [#uses=1]
-	store i64 %0, i64* %_data.addr
-	%tmp1 = load i64, i64* %_data.addr		; <i64> [#uses=1]
-	store i64 %tmp1, i64* %retval
-	%1 = load i64, i64* %retval		; <i64> [#uses=1]
-	ret i64 %1
+  %0 = call i64 asm "bswap   %eax\0A\09bswap   %edx\0A\09xchgl   %eax, %%edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %_data) nounwind
+  ret i64 %0
 }
 
+; CHECK-LABEL: __OSSwapInt64:
+; CHECK-DAG: movl 8(%esp), %edx
+; CHECK-DAG: movl 4(%esp), %eax
+; CHECK: ## InlineAsm Start
+; CHECK: ## InlineAsm End
+;       Everything is set up in EAX:EDX, return immediately.
+; CHECK-NEXT: retl
+
 ; The tied operands are not necessarily in the same order as the defs.
 ; PR13742
 define i64 @swapped(i64 %x, i64 %y) nounwind {
 entry:
-	%x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
-        %x1 = extractvalue { i64, i64 } %x0, 0
-        ret i64 %x1
+  %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+  %x1 = extractvalue { i64, i64 } %x0, 0
+  ret i64 %x1
 }

Modified: llvm/trunk/test/CodeGen/X86/pr30430.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr30430.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr30430.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr30430.ll Wed Mar  1 15:42:00 2017
@@ -30,14 +30,6 @@ define <16 x float> @makefloat(float %f1
 ; CHECK-NEXT:    vmovss %xmm5, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm8, (%rsp)
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -46,14 +38,14 @@ define <16 x float> @makefloat(float %f1
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm15 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm16 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm17 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm18 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm19 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm20 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm21 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm22 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm23 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm2, {{[0-9]+}}(%rsp)
@@ -62,14 +54,14 @@ define <16 x float> @makefloat(float %f1
 ; CHECK-NEXT:    vmovss %xmm5, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm17, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm18, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm19, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm20, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm21, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm22, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm23, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -104,11 +96,19 @@ define <16 x float> @makefloat(float %f1
 ; CHECK-NEXT:    # implicit-def: %YMM3
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm3
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm3
-; CHECK-NEXT:    # implicit-def: %ZMM16
-; CHECK-NEXT:    vmovaps %zmm3, %zmm16
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm2, %zmm16, %zmm16
-; CHECK-NEXT:    vmovaps %zmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    # implicit-def: %ZMM24
+; CHECK-NEXT:    vmovaps %zmm3, %zmm24
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
+; CHECK-NEXT:    vmovaps %zmm24, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
+; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm8, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm14, (%rsp) # 4-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll Wed Mar  1 15:42:00 2017
@@ -1653,12 +1653,8 @@ define <4 x float> @test_mm_set1_ps(floa
 define void @test_mm_setcsr(i32 %a0) nounwind {
 ; X32-LABEL: test_mm_setcsr:
 ; X32:       # BB#0:
-; X32-NEXT:    pushl %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %esp, %ecx
-; X32-NEXT:    movl %eax, (%esp)
-; X32-NEXT:    ldmxcsr (%ecx)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    leal 4(%esp), %eax
+; X32-NEXT:    ldmxcsr (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_setcsr:

Modified: llvm/trunk/test/DebugInfo/X86/discriminator.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/DebugInfo/X86/discriminator.ll?rev=296683&r1=296682&r2=296683&view=diff
==============================================================================
--- llvm/trunk/test/DebugInfo/X86/discriminator.ll (original)
+++ llvm/trunk/test/DebugInfo/X86/discriminator.ll Wed Mar  1 15:42:00 2017
@@ -59,4 +59,4 @@ attributes #0 = { nounwind uwtable "less
 
 ; CHECK: Address            Line   Column File   ISA Discriminator Flags
 ; CHECK: ------------------ ------ ------ ------ --- ------------- -------------
-; CHECK: 0x0000000000000011      2      0      1   0            42 {{$}}
+; CHECK: 0x000000000000000a      2      0      1   0            42 {{$}}
