[llvm] r227752 - [X86] Convert esp-relative movs of function arguments to pushes, step 2

Michael Kuperstein michael.m.kuperstein at intel.com
Sun Feb 1 08:56:05 PST 2015


Author: mkuper
Date: Sun Feb  1 10:56:04 2015
New Revision: 227752

URL: http://llvm.org/viewvc/llvm-project?rev=227752&view=rev
Log:
[X86] Convert esp-relative movs of function arguments to pushes, step 2

This moves the transformation introduced in r223757 into a separate MI pass.
This allows it to cover many more cases (not only cases where the call frame
cannot be reserved), and to perform rudimentary folding of loads into pushes.
It still doesn't
have a heuristic, so it is enabled only for optsize/minsize, with stack 
alignment <= 8, where it ought to be a fairly clear win.

(Re-commit of r227728)

Differential Revision: http://reviews.llvm.org/D6789
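
Concretely, for a 32-bit call sequence the pass replaces the explicit stack
allocation and esp-relative movs - roughly the shape the updated movtopush.ll
test below checks for:

  subl    $16, %esp
  movl    $4, 12(%esp)
  movl    $3, 8(%esp)
  movl    $2, 4(%esp)
  movl    $1, (%esp)
  calll   _good

with an equivalent sequence of pushes:

  pushl   $4
  pushl   $3
  pushl   $2
  pushl   $1
  calll   _good
  addl    $16, %esp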

Added:
    llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
      - copied, changed from r227745, llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
Modified:
    llvm/trunk/include/llvm/Target/TargetFrameLowering.h
    llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp
    llvm/trunk/lib/CodeGen/TargetFrameLoweringImpl.cpp
    llvm/trunk/lib/Target/X86/CMakeLists.txt
    llvm/trunk/lib/Target/X86/X86.h
    llvm/trunk/lib/Target/X86/X86FastISel.cpp
    llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
    llvm/trunk/lib/Target/X86/X86FrameLowering.h
    llvm/trunk/lib/Target/X86/X86InstrCompiler.td
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/lib/Target/X86/X86InstrInfo.h
    llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
    llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp
    llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
    llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll
    llvm/trunk/test/CodeGen/X86/movtopush.ll

Modified: llvm/trunk/include/llvm/Target/TargetFrameLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetFrameLowering.h?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetFrameLowering.h (original)
+++ llvm/trunk/include/llvm/Target/TargetFrameLowering.h Sun Feb  1 10:56:04 2015
@@ -193,6 +193,11 @@ public:
     return hasReservedCallFrame(MF) || hasFP(MF);
   }
 
+  // needsFrameIndexResolution - Do we need to perform FI resolution for
+  // this function? Normally, this is required only when the function
+  // has any stack objects. However, targets may want to override this.
+  virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
+
   /// getFrameIndexOffset - Returns the displacement from the frame register to
   /// the stack frame of the specified index.
   virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;

Modified: llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp (original)
+++ llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp Sun Feb  1 10:56:04 2015
@@ -703,7 +703,8 @@ void PEI::insertPrologEpilogCode(Machine
 /// register references and actual offsets.
 ///
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
-  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  if (!TFI.needsFrameIndexResolution(Fn)) return;
 
   // Store SPAdj at exit of a basic block.
   SmallVector<int, 8> SPState;
@@ -769,13 +770,6 @@ void PEI::replaceFrameIndices(MachineBas
       continue;
     }
 
-    // If we are looking at a call sequence, we need to keep track of
-    // the SP adjustment made by each instruction in the sequence.
-    // This includes both the frame setup/destroy pseudos (handled above),
-    // as well as other instructions that have side effects w.r.t the SP.
-    if (InsideCallSequence)
-      SPAdj += TII.getSPAdjust(I);
-
     MachineInstr *MI = I;
     bool DoIncr = true;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -854,6 +848,16 @@ void PEI::replaceFrameIndices(MachineBas
       break;
     }
 
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    // Note that this must come after eliminateFrameIndex, because 
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
     if (DoIncr && I != BB->end()) ++I;
 
     // Update register states.
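
The reordering of the SPAdj bookkeeping above matters for instructions that
both adjust the stack pointer and refer to a frame index, such as the
memory-operand pushes the new X86 pass can create: the frame index must be
resolved against the adjustment accumulated before the instruction executes,
and only then should the instruction's own adjustment be counted. A rough
sketch of the accounting for a 16-byte call sequence (callee name and frame
index purely illustrative):

  ADJCALLSTACKDOWN32 16, 16   ; net 0 - the pushes below do the adjustment
  pushl   <fi#0>              ; FI resolved with SPAdj = 0, then SPAdj += 4
  pushl   $3                  ; SPAdj += 4
  pushl   $2                  ; SPAdj += 4
  pushl   $1                  ; SPAdj += 4
  calll   _foo
  ADJCALLSTACKUP32 16, 0      ; SPAdj returns to 0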

Modified: llvm/trunk/lib/CodeGen/TargetFrameLoweringImpl.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetFrameLoweringImpl.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetFrameLoweringImpl.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetFrameLoweringImpl.cpp Sun Feb  1 10:56:04 2015
@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexRe
   FrameReg = RI->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+    const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects();
+}

Modified: llvm/trunk/lib/Target/X86/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/CMakeLists.txt?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/X86/CMakeLists.txt Sun Feb  1 10:56:04 2015
@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTabl
 
 set(sources
   X86AsmPrinter.cpp
+  X86CallFrameOptimization.cpp
   X86FastISel.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp

Modified: llvm/trunk/lib/Target/X86/X86.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.h?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.h (original)
+++ llvm/trunk/lib/Target/X86/X86.h Sun Feb  1 10:56:04 2015
@@ -64,6 +64,11 @@ FunctionPass *createX86PadShortFunctions
 /// to eliminate execution delays in some Atom processors.
 FunctionPass *createX86FixupLEAs();
 
+/// createX86CallFrameOptimization - Return a pass that optimizes
+/// the code-size of x86 call sequences. This is done by replacing
+/// esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
 } // End llvm namespace
 
 #endif

Copied: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp (from r227745, llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp?p2=llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp&p1=llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp&r1=227745&r2=227752&rev=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp Sun Feb  1 10:56:04 2015
@@ -1,400 +1,400 @@
-//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a pass that optimizes call sequences on x86.
-// Currently, it converts movs of function parameters onto the stack into 
-// pushes. This is beneficial for two main reasons:
-// 1) The push instruction encoding is much smaller than an esp-relative mov
-// 2) It is possible to push memory arguments directly. So, if the
-//    transformation is performed pre-reg-alloc, it can help relieve
-//    register pressure.
-//
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-#include "X86MachineFunctionInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-cf-opt"
-
-cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
-              cl::desc("Avoid optimizing x86 call frames for size"),
-              cl::init(false), cl::Hidden);
-
-namespace {
-class X86CallFrameOptimization : public MachineFunctionPass {
-public:
-  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-private:
-  bool shouldPerformTransformation(MachineFunction &MF);
-
-  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator I);
-
-  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
-                                   unsigned Reg);
-
-  const char *getPassName() const override {
-    return "X86 Optimize Call Frame";
-  }
-
-  const TargetInstrInfo *TII;
-  const TargetFrameLowering *TFL;
-  const MachineRegisterInfo *MRI;
-  static char ID;
-};
-
-char X86CallFrameOptimization::ID = 0;
-}
-
-FunctionPass *llvm::createX86CallFrameOptimization() {
-  return new X86CallFrameOptimization();
-}
-
-// This checks whether the transformation is legal and profitable
-bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
-  if (NoX86CFOpt.getValue())
-    return false;
-
-  // We currently only support call sequences where *all* parameters
-  // are passed on the stack.
-  // No point in running this in 64-bit mode, since some arguments are
-  // passed in-register in all common calling conventions, so the pattern
-  // we're looking for will never match.
-  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
-  if (STI.is64Bit())
-    return false;
-
-  // You would expect straight-line code between call-frame setup and
-  // call-frame destroy. You would be wrong. There are circumstances (e.g.
-  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
-  // end up with the setup and the destroy in different basic blocks.
-  // This is bad, and breaks SP adjustment.
-  // So, check that all of the frames in the function are closed inside
-  // the same block, and, for good measure, that there are no nested frames.
-  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
-  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
-  for (MachineBasicBlock &BB : MF) {
-    bool InsideFrameSequence = false;
-    for (MachineInstr &MI : BB) {
-      if (MI.getOpcode() == FrameSetupOpcode) {
-        if (InsideFrameSequence)
-          return false;
-        InsideFrameSequence = true;
-      }
-      else if (MI.getOpcode() == FrameDestroyOpcode) {
-        if (!InsideFrameSequence)
-          return false;
-        InsideFrameSequence = false;
-      }
-    }
-
-    if (InsideFrameSequence)
-      return false;
-  }
-
-  // Now that we know the transformation is legal, check if it is
-  // profitable.
-  // TODO: Add a heuristic that actually looks at the function,
-  //       and enable this for more cases.
-
-  // This transformation is always a win when we do not expect to have
-  // a reserved call frame. Under other circumstances, it may be either
-  // a win or a loss, and requires a heuristic.
-  // For now, enable it only for the relatively clear win cases.
-  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
-  if (CannotReserveFrame)
-    return true;
-
-  // For now, don't even try to evaluate the profitability when
-  // not optimizing for size.
-  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
-  bool OptForSize =
-    FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-    Attribute::OptimizeForSize) ||
-    FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
-
-  if (!OptForSize)
-    return false;
-
-  // Stack re-alignment can make this unprofitable even in terms of size.
-  // As mentioned above, a better heuristic is needed. For now, don't do this
-  // when the required alignment is above 8. (4 would be the safe choice, but
-  // some experimentation showed 8 is generally good).
-  if (TFL->getStackAlignment() > 8)
-    return false;
-
-  return true;
-}
-
-bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
-  TII = MF.getSubtarget().getInstrInfo();
-  TFL = MF.getSubtarget().getFrameLowering();
-  MRI = &MF.getRegInfo();
-
-  if (!shouldPerformTransformation(MF))
-    return false;
-
-  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
-
-  bool Changed = false;
-
-  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
-    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
-      if (I->getOpcode() == FrameSetupOpcode)
-        Changed |= adjustCallSequence(MF, *BB, I);
-
-  return Changed;
-}
-
-bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
-                                                MachineBasicBlock &MBB,
-                                                MachineBasicBlock::iterator I) {
-
-  // Check that this particular call sequence is amenable to the
-  // transformation.
-  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
-                                       MF.getSubtarget().getRegisterInfo());
-  unsigned StackPtr = RegInfo.getStackRegister();
-  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
-
-  // We expect to enter this at the beginning of a call sequence
-  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
-  MachineBasicBlock::iterator FrameSetup = I++;
-
-  
-  // For globals in PIC mode, we can have some LEAs here.
-  // Ignore them, they don't bother us.
-  // TODO: Extend this to something that covers more cases.
-  while (I->getOpcode() == X86::LEA32r)
-    ++I;
-  
-  // We expect a copy instruction here.
-  // TODO: The copy instruction is a lowering artifact.
-  //       We should also support a copy-less version, where the stack
-  //       pointer is used directly.
-  if (!I->isCopy() || !I->getOperand(0).isReg())
-    return false;
-  MachineBasicBlock::iterator SPCopy = I++;
-  StackPtr = SPCopy->getOperand(0).getReg();
-
-  // Scan the call setup sequence for the pattern we're looking for.
-  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
-  // instructions that push a sequence of 32-bit values onto the stack, with
-  // no gaps between them.
-  SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
-  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
-  if (MaxAdjust > 4)
-    MovVector.resize(MaxAdjust, nullptr);
-
-  do {
-    int Opcode = I->getOpcode();
-    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
-      break;
-
-    // We only want movs of the form:
-    // movl imm/r32, k(%esp)
-    // If we run into something else, bail.
-    // Note that AddrBaseReg may, counter to its name, not be a register,
-    // but rather a frame index.
-    // TODO: Support the fi case. This should probably work now that we
-    // have the infrastructure to track the stack pointer within a call
-    // sequence.
-    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
-        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
-        !I->getOperand(X86::AddrScaleAmt).isImm() ||
-        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
-        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
-        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
-        !I->getOperand(X86::AddrDisp).isImm())
-      return false;
-
-    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
-    assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
-
-    // We really don't want to consider the unaligned case.
-    if (StackDisp % 4)
-      return false;
-    StackDisp /= 4;
-
-    assert((size_t)StackDisp < MovVector.size() &&
-      "Function call has more parameters than the stack is adjusted for.");
-
-    // If the same stack slot is being filled twice, something's fishy.
-    if (MovVector[StackDisp] != nullptr)
-      return false;
-    MovVector[StackDisp] = I;
-
-    ++I;
-  } while (I != MBB.end());
-
-  // We now expect the end of the sequence - a call and a stack adjust.
-  if (I == MBB.end())
-    return false;
-
-  // For PCrel calls, we expect an additional COPY of the basereg.
-  // If we find one, skip it.
-  if (I->isCopy()) {
-    if (I->getOperand(1).getReg() ==
-      MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
-      ++I;
-    else
-      return false;
-  }
-
-  if (!I->isCall())
-    return false;
-  MachineBasicBlock::iterator Call = I;
-  if ((++I)->getOpcode() != FrameDestroyOpcode)
-    return false;
-
-  // Now, go through the vector, and see that we don't have any gaps,
-  // but only a series of 32-bit MOVs.
-  
-  int64_t ExpectedDist = 0;
-  auto MMI = MovVector.begin(), MME = MovVector.end();
-  for (; MMI != MME; ++MMI, ExpectedDist += 4)
-    if (*MMI == nullptr)
-      break;
-  
-  // If the call had no parameters, do nothing
-  if (!ExpectedDist)
-    return false;
-
-  // We are either at the last parameter, or a gap. 
-  // Make sure it's not a gap
-  for (; MMI != MME; ++MMI)
-    if (*MMI != nullptr)
-      return false;
-
-  // Ok, we can in fact do the transformation for this call.
-  // Do not remove the FrameSetup instruction, but adjust the parameters.
-  // PEI will end up finalizing the handling of this.
-  FrameSetup->getOperand(1).setImm(ExpectedDist);
-
-  DebugLoc DL = I->getDebugLoc();
-  // Now, iterate through the vector in reverse order, and replace the movs
-  // with pushes. MOVmi/MOVmr instructions have no defs, so there is no need
-  // to replace uses.
-  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
-    MachineBasicBlock::iterator MOV = *MovVector[Idx];
-    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
-    if (MOV->getOpcode() == X86::MOV32mi) {
-      unsigned PushOpcode = X86::PUSHi32;
-      // If the operand is a small (8-bit) immediate, we can use a
-      // PUSH instruction with a shorter encoding.
-      // Note that isImm() may fail even though this is a MOVmi, because
-      // the operand can also be a symbol.
-      if (PushOp.isImm()) {
-        int64_t Val = PushOp.getImm();
-        if (isInt<8>(Val))
-          PushOpcode = X86::PUSH32i8;
-      }
-      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
-    } else {
-      unsigned int Reg = PushOp.getReg();
-
-      // If PUSHrmm is not slow on this target, try to fold the source of the
-      // push into the instruction.
-      const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
-      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
-
-      // Check that this is legal to fold. Right now, we're extremely
-      // conservative about that.
-      MachineInstr *DefMov = nullptr;
-      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
-        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
-
-        unsigned NumOps = DefMov->getDesc().getNumOperands();
-        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
-          Push->addOperand(DefMov->getOperand(i));
-
-        DefMov->eraseFromParent();
-      } else {
-        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
-      }
-    }
-
-    MBB.erase(MOV);
-  }
-
-  // The stack-pointer copy is no longer used in the call sequences.
-  // There should not be any other users, but we can't commit to that, so:
-  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
-    SPCopy->eraseFromParent();
-
-  // Once we've done this, we need to make sure PEI doesn't assume a reserved
-  // frame.
-  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
-  FuncInfo->setHasPushSequences(true);
-
-  return true;
-}
-
-MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
-    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
-  // Do an extremely restricted form of load folding.
-  // ISel will often create patterns like:
-  // movl    4(%edi), %eax
-  // movl    8(%edi), %ecx
-  // movl    12(%edi), %edx
-  // movl    %edx, 8(%esp)
-  // movl    %ecx, 4(%esp)
-  // movl    %eax, (%esp)
-  // call
-  // Get rid of those with prejudice.
-  if (!TargetRegisterInfo::isVirtualRegister(Reg))
-    return nullptr;
-
-  // Make sure this is the only use of Reg.
-  if (!MRI->hasOneNonDBGUse(Reg))
-    return nullptr;
-
-  MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
-
-  // Make sure the def is a MOV from memory.
-  // If the def is in another block, give up.
-  if (DefMI->getOpcode() != X86::MOV32rm ||
-      DefMI->getParent() != FrameSetup->getParent())
-    return nullptr;
-
-  // Be careful with movs that load from a stack slot, since the load may
-  // get resolved incorrectly.
-  // TODO: Again, we already have the infrastructure, so this should work.
-  if (!DefMI->getOperand(1).isReg())
-    return nullptr;
-
-  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
-  // of MOVs. To be less conservative would require duplicating a lot of the
-  // logic from PeepholeOptimizer.
-  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
-  // to be smarter about folding into pushes. 
-  for (auto I = DefMI; I != FrameSetup; ++I)
-    if (I->getOpcode() != X86::MOV32rm)
-      return nullptr;
-
-  return DefMI;
-}
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into 
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+//    transformation is performed pre-reg-alloc, it can help relieve
+//    register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
+              cl::desc("Avoid optimizing x86 call frames for size"),
+              cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  bool shouldPerformTransformation(MachineFunction &MF);
+
+  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I);
+
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  const char *getPassName() const override {
+    return "X86 Optimize Call Frame";
+  }
+
+  const TargetInstrInfo *TII;
+  const TargetFrameLowering *TFL;
+  const MachineRegisterInfo *MRI;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal and profitable
+bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // We currently only support call sequences where *all* parameters
+  // are passed on the stack.
+  // No point in running this in 64-bit mode, since some arguments are
+  // passed in-register in all common calling conventions, so the pattern
+  // we're looking for will never match.
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  if (STI.is64Bit())
+    return false;
+
+  // You would expect straight-line code between call-frame setup and
+  // call-frame destroy. You would be wrong. There are circumstances (e.g.
+  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+  // end up with the setup and the destroy in different basic blocks.
+  // This is bad, and breaks SP adjustment.
+  // So, check that all of the frames in the function are closed inside
+  // the same block, and, for good measure, that there are no nested frames.
+  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &BB : MF) {
+    bool InsideFrameSequence = false;
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        if (InsideFrameSequence)
+          return false;
+        InsideFrameSequence = true;
+      }
+      else if (MI.getOpcode() == FrameDestroyOpcode) {
+        if (!InsideFrameSequence)
+          return false;
+        InsideFrameSequence = false;
+      }
+    }
+
+    if (InsideFrameSequence)
+      return false;
+  }
+
+  // Now that we know the transformation is legal, check if it is
+  // profitable.
+  // TODO: Add a heuristic that actually looks at the function,
+  //       and enable this for more cases.
+
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  // For now, enable it only for the relatively clear win cases.
+  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  // For now, don't even try to evaluate the profitability when
+  // not optimizing for size.
+  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+  bool OptForSize =
+    FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+    Attribute::OptimizeForSize) ||
+    FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+
+  if (!OptForSize)
+    return false;
+
+  // Stack re-alignment can make this unprofitable even in terms of size.
+  // As mentioned above, a better heuristic is needed. For now, don't do this
+  // when the required alignment is above 8. (4 would be the safe choice, but
+  // some experimentation showed 8 is generally good).
+  if (TFL->getStackAlignment() > 8)
+    return false;
+
+  return true;
+}
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget().getInstrInfo();
+  TFL = MF.getSubtarget().getFrameLowering();
+  MRI = &MF.getRegInfo();
+
+  if (!shouldPerformTransformation(MF))
+    return false;
+
+  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+  bool Changed = false;
+
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode)
+        Changed |= adjustCallSequence(MF, *BB, I);
+
+  return Changed;
+}
+
+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator I) {
+
+  // Check that this particular call sequence is amenable to the
+  // transformation.
+  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+                                       MF.getSubtarget().getRegisterInfo());
+  unsigned StackPtr = RegInfo.getStackRegister();
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+  // We expect to enter this at the beginning of a call sequence
+  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+  MachineBasicBlock::iterator FrameSetup = I++;
+
+  
+  // For globals in PIC mode, we can have some LEAs here.
+  // Ignore them, they don't bother us.
+  // TODO: Extend this to something that covers more cases.
+  while (I->getOpcode() == X86::LEA32r)
+    ++I;
+  
+  // We expect a copy instruction here.
+  // TODO: The copy instruction is a lowering artifact.
+  //       We should also support a copy-less version, where the stack
+  //       pointer is used directly.
+  if (!I->isCopy() || !I->getOperand(0).isReg())
+    return false;
+  MachineBasicBlock::iterator SPCopy = I++;
+  StackPtr = SPCopy->getOperand(0).getReg();
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
+  // instructions that push a sequence of 32-bit values onto the stack, with
+  // no gaps between them.
+  SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
+  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+  if (MaxAdjust > 4)
+    MovVector.resize(MaxAdjust, nullptr);
+
+  do {
+    int Opcode = I->getOpcode();
+    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+      break;
+
+    // We only want movs of the form:
+    // movl imm/r32, k(%esp)
+    // If we run into something else, bail.
+    // Note that AddrBaseReg may, counter to its name, not be a register,
+    // but rather a frame index.
+    // TODO: Support the fi case. This should probably work now that we
+    // have the infrastructure to track the stack pointer within a call
+    // sequence.
+    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return false;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+    assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
+
+    // We really don't want to consider the unaligned case.
+    if (StackDisp % 4)
+      return false;
+    StackDisp /= 4;
+
+    assert((size_t)StackDisp < MovVector.size() &&
+      "Function call has more parameters than the stack is adjusted for.");
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (MovVector[StackDisp] != nullptr)
+      return false;
+    MovVector[StackDisp] = I;
+
+    ++I;
+  } while (I != MBB.end());
+
+  // We now expect the end of the sequence - a call and a stack adjust.
+  if (I == MBB.end())
+    return false;
+
+  // For PCrel calls, we expect an additional COPY of the basereg.
+  // If we find one, skip it.
+  if (I->isCopy()) {
+    if (I->getOperand(1).getReg() ==
+      MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
+      ++I;
+    else
+      return false;
+  }
+
+  if (!I->isCall())
+    return false;
+  MachineBasicBlock::iterator Call = I;
+  if ((++I)->getOpcode() != FrameDestroyOpcode)
+    return false;
+
+  // Now, go through the vector, and see that we don't have any gaps,
+  // but only a series of 32-bit MOVs.
+  
+  int64_t ExpectedDist = 0;
+  auto MMI = MovVector.begin(), MME = MovVector.end();
+  for (; MMI != MME; ++MMI, ExpectedDist += 4)
+    if (*MMI == nullptr)
+      break;
+  
+  // If the call had no parameters, do nothing
+  if (!ExpectedDist)
+    return false;
+
+  // We are either at the last parameter, or a gap. 
+  // Make sure it's not a gap
+  for (; MMI != MME; ++MMI)
+    if (*MMI != nullptr)
+      return false;
+
+  // Ok, we can in fact do the transformation for this call.
+  // Do not remove the FrameSetup instruction, but adjust the parameters.
+  // PEI will end up finalizing the handling of this.
+  FrameSetup->getOperand(1).setImm(ExpectedDist);
+
+  DebugLoc DL = I->getDebugLoc();
+  // Now, iterate through the vector in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr instructions have no defs, so there is no need
+  // to replace uses.
+  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+    MachineBasicBlock::iterator MOV = *MovVector[Idx];
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    if (MOV->getOpcode() == X86::MOV32mi) {
+      unsigned PushOpcode = X86::PUSHi32;
+      // If the operand is a small (8-bit) immediate, we can use a
+      // PUSH instruction with a shorter encoding.
+      // Note that isImm() may fail even though this is a MOVmi, because
+      // the operand can also be a symbol.
+      if (PushOp.isImm()) {
+        int64_t Val = PushOp.getImm();
+        if (isInt<8>(Val))
+          PushOpcode = X86::PUSH32i8;
+      }
+      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+    } else {
+      unsigned int Reg = PushOp.getReg();
+
+      // If PUSHrmm is not slow on this target, try to fold the source of the
+      // push into the instruction.
+      const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+
+      // Check that this is legal to fold. Right now, we're extremely
+      // conservative about that.
+      MachineInstr *DefMov = nullptr;
+      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
+
+        unsigned NumOps = DefMov->getDesc().getNumOperands();
+        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+          Push->addOperand(DefMov->getOperand(i));
+
+        DefMov->eraseFromParent();
+      } else {
+        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
+      }
+    }
+
+    MBB.erase(MOV);
+  }
+
+  // The stack-pointer copy is no longer used in the call sequences.
+  // There should not be any other users, but we can't commit to that, so:
+  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
+    SPCopy->eraseFromParent();
+
+  // Once we've done this, we need to make sure PEI doesn't assume a reserved
+  // frame.
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  FuncInfo->setHasPushSequences(true);
+
+  return true;
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+  // Do an extremely restricted form of load folding.
+  // ISel will often create patterns like:
+  // movl    4(%edi), %eax
+  // movl    8(%edi), %ecx
+  // movl    12(%edi), %edx
+  // movl    %edx, 8(%esp)
+  // movl    %ecx, 4(%esp)
+  // movl    %eax, (%esp)
+  // call
+  // Get rid of those with prejudice.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+
+  // Make sure this is the only use of Reg.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
+
+  // Make sure the def is a MOV from memory.
+  // If the def is in another block, give up.
+  if (DefMI->getOpcode() != X86::MOV32rm ||
+      DefMI->getParent() != FrameSetup->getParent())
+    return nullptr;
+
+  // Be careful with movs that load from a stack slot, since the load may
+  // get resolved incorrectly.
+  // TODO: Again, we already have the infrastructure, so this should work.
+  if (!DefMI->getOperand(1).isReg())
+    return nullptr;
+
+  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
+  // of MOVs. To be less conservative would require duplicating a lot of the
+  // logic from PeepholeOptimizer.
+  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
+  // to be smarter about folding into pushes. 
+  for (auto I = DefMI; I != FrameSetup; ++I)
+    if (I->getOpcode() != X86::MOV32rm)
+      return nullptr;
+
+  return DefMI;
+}
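
For reference, the encoding sizes behind reason 1 in the header comment: on
x86, pushl $4 is 2 bytes (6A 04, PUSH32i8) and pushl $1000 is 5 bytes
(68 E8 03 00 00, PUSHi32), while the equivalent movl $imm, disp8(%esp) costs
8 bytes (C7 44 24 disp imm32). Likewise pushl %eax is 1 byte against 4 bytes
for movl %eax, disp8(%esp).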

Modified: llvm/trunk/lib/Target/X86/X86FastISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FastISel.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FastISel.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FastISel.cpp Sun Feb  1 10:56:04 2015
@@ -2735,7 +2735,7 @@ bool X86FastISel::fastLowerCall(CallLowe
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-    .addImm(NumBytes);
+    .addImm(NumBytes).addImm(0);
 
   // Walk the register/memloc assignments, inserting copies/loads.
   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(

Modified: llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FrameLowering.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.cpp Sun Feb  1 10:56:04 2015
@@ -38,7 +38,34 @@ using namespace llvm;
 extern cl::opt<bool> ForceStackAlign;
 
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  return !MF.getFrameInfo()->hasVarSizedObjects();
+  return !MF.getFrameInfo()->hasVarSizedObjects() &&
+         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
+                               (MF.getSubtarget().getRegisterInfo());
+  return hasReservedCallFrame(MF) ||
+         (hasFP(MF) && !TRI->needsStackRealignment(MF))
+         || TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function? Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the name - it resolves call frame setup/destroy pseudos
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects() ||
+         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -101,16 +128,6 @@ static unsigned getANDriOpcode(bool IsLP
   return X86::AND32ri;
 }
 
-static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) {
-  // We don't support LP64 for now.
-  assert(!IsLP64);
-
-  if (MO.isImm() && isInt<8>(MO.getImm()))
-    return X86::PUSH32i8;
-
-  return X86::PUSHi32;;
-}
-
 static unsigned getLEArOpcode(unsigned IsLP64) {
   return IsLP64 ? X86::LEA64r : X86::LEA32r;
 }
@@ -1917,100 +1934,6 @@ void X86FrameLowering::adjustForHiPEProl
 #endif
 }
 
-bool X86FrameLowering::
-convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator I, uint64_t Amount) const {
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
-    MF.getSubtarget().getRegisterInfo());
-  unsigned StackPtr = RegInfo.getStackRegister();
-
-  // Scan the call setup sequence for the pattern we're looking for.
-  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
-  // instructions, that push a sequence of 32-bit values onto the stack, with
-  // no gaps.  
-  std::map<int64_t, MachineBasicBlock::iterator> MovMap;
-  do {
-    int Opcode = I->getOpcode();
-    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
-      break;
- 
-    // We only want movs of the form:
-    // movl imm/r32, k(%ecx)
-    // If we run into something else, bail
-    // Note that AddrBaseReg may, counterintuitively, not be a register...
-    if (!I->getOperand(X86::AddrBaseReg).isReg() || 
-        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
-        !I->getOperand(X86::AddrScaleAmt).isImm() ||
-        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
-        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
-        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
-        !I->getOperand(X86::AddrDisp).isImm())
-      return false;
-
-    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
-    
-    // We don't want to consider the unaligned case.
-    if (StackDisp % 4)
-      return false;
-
-    // If the same stack slot is being filled twice, something's fishy.
-    if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
-      return false;
-
-    ++I;
-  } while (I != MBB.end());
-
-  // We now expect the end of the sequence - a call and a stack adjust.
-  if (I == MBB.end())
-    return false;
-  if (!I->isCall())
-    return false;
-  MachineBasicBlock::iterator Call = I;
-  if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
-    return false;
-
-  // Now, go through the map, and see that we don't have any gaps,
-  // but only a series of 32-bit MOVs.
-  // Since std::map provides ordered iteration, the original order
-  // of the MOVs doesn't matter.
-  int64_t ExpectedDist = 0;
-  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; 
-       ++MMI, ExpectedDist += 4)
-    if (MMI->first != ExpectedDist)
-      return false;
-
-  // Ok, everything looks fine. Do the transformation.
-  DebugLoc DL = I->getDebugLoc();
-
-  // It's possible the original stack adjustment amount was larger than
-  // that done by the pushes. If so, we still need a SUB.
-  Amount -= ExpectedDist;
-  if (Amount) {
-    MachineInstr* Sub = BuildMI(MBB, Call, DL,
-                          TII.get(getSUBriOpcode(false, Amount)), StackPtr)
-                  .addReg(StackPtr).addImm(Amount);
-    Sub->getOperand(3).setIsDead();
-  }
-
-  // Now, iterate through the map in reverse order, and replace the movs
-  // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
-  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
-    MachineBasicBlock::iterator MOV = MMI->second;
-    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
-
-    // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size
-    int PushOpcode = X86::PUSH32r;
-    if (MOV->getOpcode() == X86::MOV32mi)
-      PushOpcode = getPUSHiOpcode(false, PushOp);
-
-    BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp);
-    MBB.erase(MOV);
-  }
-
-  return true;
-}
-
 void X86FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -2025,7 +1948,7 @@ eliminateCallFramePseudoInstr(MachineFun
   bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL = I->getDebugLoc();
   uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
-  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+  uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
   I = MBB.erase(I);
 
   if (!reserveCallFrame) {
@@ -2045,24 +1968,18 @@ eliminateCallFramePseudoInstr(MachineFun
     Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
 
     MachineInstr *New = nullptr;
-    if (Opcode == TII.getCallFrameSetupOpcode()) {
-      // Try to convert movs to the stack into pushes.
-      // We currently only look for a pattern that appears in 32-bit
-      // calling conventions.
-      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
-        return;
 
-      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
-                    StackPtr)
-        .addReg(StackPtr)
-        .addImm(Amount);
-    } else {
-      assert(Opcode == TII.getCallFrameDestroyOpcode());
-
-      // Factor out the amount the callee already popped.
-      Amount -= CalleeAmt;
+    // Factor out the amount that gets handled inside the sequence
+    // (Pushes of arguments for frame setup, callee pops for frame destroy)
+    Amount -= InternalAmt;
+
+    if (Amount) {
+      if (Opcode == TII.getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      } else {
+        assert(Opcode == TII.getCallFrameDestroyOpcode());
 
-      if (Amount) {
         unsigned Opc = getADDriOpcode(IsLP64, Amount);
         New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
           .addReg(StackPtr).addImm(Amount);
@@ -2080,13 +1997,13 @@ eliminateCallFramePseudoInstr(MachineFun
     return;
   }
 
-  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+  if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
     // If we are performing frame pointer elimination and if the callee pops
     // something off the stack pointer, add it back.  We do this until we have
     // more advanced stack pointer tracking ability.
-    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+    unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
     MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
-      .addReg(StackPtr).addImm(CalleeAmt);
+      .addReg(StackPtr).addImm(InternalAmt);
 
     // The EFLAGS implicit def is dead.
     New->getOperand(3).setIsDead();
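
Worked through for the four-push sequence above: the frame setup pseudo now
carries Amount = 16 and InternalAmt = 16 (set by X86CallFrameOptimization via
setImm on operand 1), so Amount - InternalAmt = 0 and no sub is emitted - the
pushes themselves move the stack pointer. The matching frame destroy carries
Amount = 16 and InternalAmt = 0 (no callee-popped bytes for cdecl), so the
usual addl $16, %esp is still emitted after the call, as the updated
movtopush.ll checks expect.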

Modified: llvm/trunk/lib/Target/X86/X86FrameLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FrameLowering.h?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.h Sun Feb  1 10:56:04 2015
@@ -66,6 +66,8 @@ public:
 
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  bool needsFrameIndexResolution(const MachineFunction &MF) const override;
 
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
   int getFrameIndexReference(const MachineFunction &MF, int FI,

Modified: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrCompiler.td?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrCompiler.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td Sun Feb  1 10:56:04 2015
@@ -43,15 +43,18 @@ let hasSideEffects = 0, isNotDuplicable
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKDOWN",
-                           [(X86callseq_start timm:$amt)]>,
+                           []>,
                           Requires<[NotLP64]>;
 def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[NotLP64]>;
 }
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
 
 // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
 // a stack adjustment and the codegen must know that they may modify the stack
@@ -59,16 +62,17 @@ def ADJCALLSTACKUP32   : I<0, Pseudo, (o
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKDOWN",
-                           [(X86callseq_start timm:$amt)]>,
+                           []>,
                           Requires<[IsLP64]>;
 def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[IsLP64]>;
 }
-
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
 
 
 // x86-64 va_start lowering magic.
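
The new second immediate on ADJCALLSTACKDOWN records how many of the
frame-setup bytes are handled inside the call sequence itself (by pushes).
The Pat fragments keep ISel emitting 0 for it, and X86CallFrameOptimization
later rewrites it with setImm; FastISel builds the pseudo directly, which is
why it needs the explicit addImm(0) in the X86FastISel change above.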

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Sun Feb  1 10:56:04 2015
@@ -1804,6 +1804,58 @@ X86InstrInfo::isCoalescableExtInstr(cons
   return false;
 }
 
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+  if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+      MI->getOpcode() == getCallFrameDestroyOpcode()) {
+    unsigned StackAlign = TFI->getStackAlignment();
+    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * 
+                 StackAlign;
+
+    SPAdj -= MI->getOperand(1).getImm();
+
+    if (MI->getOpcode() == getCallFrameSetupOpcode())
+      return SPAdj;
+    else
+      return -SPAdj;
+  }
+  
+  // To know whether a call adjusts the stack, we need information 
+  // that is bound to the following ADJCALLSTACKUP pseudo.
+  // Look for the next ADJCALLSTACKUP that follows the call.
+  if (MI->isCall()) {
+    const MachineBasicBlock* MBB = MI->getParent();
+    auto I = ++MachineBasicBlock::const_iterator(MI);
+    for (auto E = MBB->end(); I != E; ++I) {
+      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+          I->isCall())
+        break;
+    }
+
+    // If we could not find a frame destroy opcode, then it has already
+    // been simplified, so we don't care.
+    if (I->getOpcode() != getCallFrameDestroyOpcode())
+      return 0;
+
+    return -(I->getOperand(1).getImm());
+  }
+
+  // Currently handle only PUSHes we can reasonably expect to see
+  // in call sequences
+  switch (MI->getOpcode()) {
+  default: 
+    return 0;
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+    return 4;
+  }
+}
+
 /// isFrameOperand - Return true and the FrameIndex if the specified
 /// operand and follow operands form a reference to the stack frame.
 bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
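
To make the new hook concrete, here is how it evaluates a 16-byte, four-push
call sequence (a worked example assuming 4-byte stack alignment and the
operand values the new pass produces):

  ADJCALLSTACKDOWN32 16, 16   ->  (16+3)/4*4 - 16         =   0
  pushl   $4 (x4)             ->  +4 per push             = +16
  calll   _good               ->  -(destroy operand 1)    =  -0
  ADJCALLSTACKUP32 16, 0      ->  -((16+3)/4*4 - 0)       = -16

The contributions sum to zero across the sequence, which keeps PEI's running
SPAdj consistent from one call sequence to the next.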

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Sun Feb  1 10:56:04 2015
@@ -175,6 +175,11 @@ public:
   ///
   const X86RegisterInfo &getRegisterInfo() const { return RI; }
 
+  /// getSPAdjust - This returns the stack pointer adjustment made by
+  /// this instruction. For x86, we need to handle more complex call
+  /// sequences involving PUSHes.
+  int getSPAdjust(const MachineInstr *MI) const override;
+
   /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
   /// extension instruction. That is, it's like a copy where it's legal for the
   /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns

Modified: llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h Sun Feb  1 10:56:04 2015
@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public Ma
   unsigned ArgumentStackSize;
   /// NumLocalDynamics - Number of local-dynamic TLS accesses.
   unsigned NumLocalDynamics;
+  /// HasPushSequences - Keeps track of whether this function uses sequences
+  /// of pushes to pass function parameters.
+  bool HasPushSequences;
 
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
@@ -97,7 +100,8 @@ public:
                              VarArgsGPOffset(0),
                              VarArgsFPOffset(0),
                              ArgumentStackSize(0),
-                             NumLocalDynamics(0) {}
+                             NumLocalDynamics(0),
+                             HasPushSequences(false) {}
 
   explicit X86MachineFunctionInfo(MachineFunction &MF)
     : ForceFramePointer(false),
@@ -113,11 +117,15 @@ public:
       VarArgsGPOffset(0),
       VarArgsFPOffset(0),
       ArgumentStackSize(0),
-      NumLocalDynamics(0) {}
+      NumLocalDynamics(0),
+      HasPushSequences(false) {}
 
   bool getForceFramePointer() const { return ForceFramePointer;}
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
 
+  bool getHasPushSequences() const { return HasPushSequences; }
+  void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
   bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
   void setRestoreBasePointer(const MachineFunction *MF);
   int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }

Modified: llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86RegisterInfo.cpp Sun Feb  1 10:56:04 2015
@@ -468,8 +468,6 @@ void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, unsigned FIOperandNum,
                                      RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Unexpected");
-
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(Mac
   } else
     FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
 
+  if (BasePtr == StackPtr)
+    FIOffset += SPAdj;
+
   // The frame index format for stackmaps and patchpoints is different from the
   // X86 format. It only has a FI and an offset.
   if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
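
The SPAdj bias is needed only when the frame index is resolved against the
stack pointer, since a frame or base pointer does not move during the call
sequence. For example, a stack object at 8(%esp) on entry to the sequence is
at 16(%esp) once two pushes (8 bytes) have been emitted, and SPAdj carries
exactly that correction.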

Modified: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetMachine.cpp?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp Sun Feb  1 10:56:04 2015
@@ -193,6 +193,7 @@ public:
   void addIRPasses() override;
   bool addInstSelector() override;
   bool addILPOpts() override;
+  void addPreRegAlloc() override;
   void addPostRegAlloc() override;
   void addPreEmitPass() override;
 };
@@ -226,6 +227,10 @@ bool X86PassConfig::addILPOpts() {
   return true;
 }
 
+void X86PassConfig::addPreRegAlloc() {
+  addPass(createX86CallFrameOptimization());
+}
+
 void X86PassConfig::addPostRegAlloc() {
   addPass(createX86FloatingPointStackifierPass());
 }
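
The factory called above follows the usual X86 pass pattern. A minimal
skeleton of how such a machine-function pass is conventionally wired up
(the committed X86CallFrameOptimization.cpp is of course much richer):

    #include "X86.h"  // presumably declares createX86CallFrameOptimization()
    #include "llvm/CodeGen/MachineFunctionPass.h"
    using namespace llvm;

    namespace {
    class X86CallFrameOptimization : public MachineFunctionPass {
    public:
      static char ID;
      X86CallFrameOptimization() : MachineFunctionPass(ID) {}
      bool runOnMachineFunction(MachineFunction &MF) override;
    };
    char X86CallFrameOptimization::ID = 0;
    } // end anonymous namespace

    FunctionPass *llvm::createX86CallFrameOptimization() {
      return new X86CallFrameOptimization();
    }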

Modified: llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll (original)
+++ llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll Sun Feb  1 10:56:04 2015
@@ -31,7 +31,7 @@ blah:
           to label %invoke.cont unwind label %lpad
 
 ;  Uses end as sret param.
-; CHECK:  movl %[[end]], (%esp)
+; CHECK:  pushl %[[end]]
 ; CHECK:  calll _plus
 
 invoke.cont:

Modified: llvm/trunk/test/CodeGen/X86/movtopush.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movtopush.ll?rev=227752&r1=227751&r2=227752&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/movtopush.ll (original)
+++ llvm/trunk/test/CodeGen/X86/movtopush.ll Sun Feb  1 10:56:04 2015
@@ -1,24 +1,65 @@
 ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
 ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED 
+
 declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
 declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
 
 ; Here, we should have a reserved frame, so we don't expect pushes
-; NORMAL-LABEL: test1
+; NORMAL-LABEL: test1:
 ; NORMAL: subl    $16, %esp
 ; NORMAL-NEXT: movl    $4, 12(%esp)
 ; NORMAL-NEXT: movl    $3, 8(%esp)
 ; NORMAL-NEXT: movl    $2, 4(%esp)
 ; NORMAL-NEXT: movl    $1, (%esp)
 ; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
 define void @test1() {
 entry:
   call void @good(i32 1, i32 2, i32 3, i32 4)
   ret void
 }
 
-; Here, we expect a sequence of 4 immediate pushes
-; NORMAL-LABEL: test2
+; We're optimizing for code size, so we should get pushes for x86,
+; even though there is a reserved call frame.
+; Make sure we don't touch x86-64
+; NORMAL-LABEL: test1b:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+; X64-LABEL: test1b:
+; X64: movl    $1, %ecx
+; X64-NEXT: movl    $2, %edx
+; X64-NEXT: movl    $3, %r8d
+; X64-NEXT: movl    $4, %r9d
+; X64-NEXT: callq   good
+define void @test1b() optsize {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Same as above, but for minsize
+; NORMAL-LABEL: test1c:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test1c() minsize {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; If we don't have a reserved frame, we should have pushes
+; NORMAL-LABEL: test2:
 ; NORMAL-NOT: subl {{.*}} %esp
 ; NORMAL: pushl   $4
 ; NORMAL-NEXT: pushl   $3
@@ -34,53 +75,53 @@ entry:
 
 ; Again, we expect a sequence of 4 immediate pushes
 ; Checks that we generate the right pushes for >8bit immediates
-; NORMAL-LABEL: test2b
+; NORMAL-LABEL: test2b:
 ; NORMAL-NOT: subl {{.*}} %esp
 ; NORMAL: pushl   $4096
 ; NORMAL-NEXT: pushl   $3072
 ; NORMAL-NEXT: pushl   $2048
 ; NORMAL-NEXT: pushl   $1024
 ; NORMAL-NEXT: call
-define void @test2b(i32 %k) {
+; NORMAL-NEXT: addl $16, %esp
+define void @test2b() optsize {
 entry:
-  %a = alloca i32, i32 %k
   call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
   ret void
 }
 
 ; The first push should push a register
-; NORMAL-LABEL: test3
+; NORMAL-LABEL: test3:
 ; NORMAL-NOT: subl {{.*}} %esp
 ; NORMAL: pushl   $4
 ; NORMAL-NEXT: pushl   $3
 ; NORMAL-NEXT: pushl   $2
 ; NORMAL-NEXT: pushl   %e{{..}}
 ; NORMAL-NEXT: call
-define void @test3(i32 %k) {
+; NORMAL-NEXT: addl $16, %esp
+define void @test3(i32 %k) optsize {
 entry:
-  %a = alloca i32, i32 %k
   call void @good(i32 %k, i32 2, i32 3, i32 4)
   ret void
 }
 
 ; We don't support weird calling conventions
-; NORMAL-LABEL: test4
+; NORMAL-LABEL: test4:
 ; NORMAL: subl    $12, %esp
 ; NORMAL-NEXT: movl    $4, 8(%esp)
 ; NORMAL-NEXT: movl    $3, 4(%esp)
 ; NORMAL-NEXT: movl    $1, (%esp)
 ; NORMAL-NEXT: movl    $2, %eax
 ; NORMAL-NEXT: call
-define void @test4(i32 %k) {
+; NORMAL-NEXT: addl $12, %esp
+define void @test4() optsize {
 entry:
-  %a = alloca i32, i32 %k
   call void @inreg(i32 1, i32 2, i32 3, i32 4)
   ret void
 }
 
-; Check that additional alignment is added when the pushes
-; don't add up to the required alignment.
-; ALIGNED-LABEL: test5
+; When there is no reserved call frame, check that additional alignment
+; is added when the pushes don't add up to the required alignment.
+; ALIGNED-LABEL: test5:
 ; ALIGNED: subl    $16, %esp
 ; ALIGNED-NEXT: pushl   $4
 ; ALIGNED-NEXT: pushl   $3
@@ -97,7 +138,7 @@ entry:
 ; Check that pushing the addresses of globals (Or generally, things that 
 ; aren't exactly immediates) isn't broken.
 ; Fixes PR21878.
-; NORMAL-LABEL: test6
+; NORMAL-LABEL: test6:
 ; NORMAL: pushl    $_ext
 ; NORMAL-NEXT: call
 declare void @f(i8*)
@@ -110,3 +151,108 @@ bb:
   alloca i32
   ret void
 }
+
+; Check that we fold simple cases into the push
+; NORMAL-LABEL: test7:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: movl 4(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: pushl   $4
+; NORMAL-NEXT: pushl   ([[EAX]])
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test7(i32* %ptr) optsize {
+entry:
+  %val = load i32* %ptr
+  call void @good(i32 1, i32 2, i32 %val, i32 4)
+  ret void
+}
+
+; But we don't want to fold stack-relative loads into the push,
+; because the offset will be wrong
+; NORMAL-LABEL: test8:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: movl 4(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: pushl   $4
+; NORMAL-NEXT: pushl   [[EAX]]
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test8(i32* %ptr) optsize {
+entry:
+  %val = ptrtoint i32* %ptr to i32
+  call void @good(i32 1, i32 2, i32 %val, i32 4)
+  ret void
+}
+
+; If one call site is using push instructions, and the other isn't
+; (because it has frame-index references), then we must resolve
+; these references correctly.
+; NORMAL-LABEL: test9:
+; NORMAL-NOT: leal (%esp), 
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: subl $16, %esp
+; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: movl    [[EAX]], 12(%esp)
+; NORMAL-NEXT: movl    $7, 8(%esp)
+; NORMAL-NEXT: movl    $6, 4(%esp)
+; NORMAL-NEXT: movl    $5, (%esp)
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test9() optsize {
+entry:
+  %p = alloca i32, align 4
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  %0 = ptrtoint i32* %p to i32
+  call void @good(i32 5, i32 6, i32 7, i32 %0)
+  ret void
+}
+
+; We can end up with an indirect call whose target is reloaded on the spot.
+; Make sure we reference the correct stack slot - we spill into (%esp)
+; and reload from 16(%esp) due to the pushes.
+; NORMAL-LABEL: test10:
+; NORMAL: movl $_good, [[ALLOC:.*]]
+; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]]
+; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill
+; NORMAL: nop
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: calll *16(%esp)
+; NORMAL-NEXT: addl $16, %esp
+define void @test10() optsize {
+  %stack_fptr = alloca void (i32, i32, i32, i32)*
+  store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
+  %good_ptr = load volatile void (i32, i32, i32, i32)** %stack_fptr
+  call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
+  call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We can't fold the load from the global into the push because of 
+; interference from the store
+; NORMAL-LABEL: test11:
+; NORMAL: movl    _the_global, [[EAX:%e..]]
+; NORMAL-NEXT: movl    $42, _the_global
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl [[EAX]]
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+@the_global = external global i32
+define void @test11() optsize {
+  %myload = load i32* @the_global
+  store i32 42, i32* @the_global
+  call void @good(i32 %myload, i32 2, i32 3, i32 4)
+  ret void
+}
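
Taken together, test7, test8 and test11 pin down when a load may be
folded into a push: the load must not be ESP-relative, and nothing
between the load and the push may clobber the loaded memory. One
plausible shape for that guard (speculative; the committed heuristic is
in X86CallFrameOptimization.cpp):

    #include "X86.h"  // X86::ESP register enum, via the generated tables
    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    static bool isSafeToFold(MachineBasicBlock::const_iterator Load,
                             MachineBasicBlock::const_iterator Push) {
      // An ESP-relative load (test8) must not be folded: the pushes move
      // ESP, so the folded address would read the wrong slot.
      for (const MachineOperand &MO : Load->uses())
        if (MO.isReg() && MO.getReg() == X86::ESP)
          return false;
      // An intervening store (test11) could change the loaded value.
      for (auto I = std::next(Load); I != Push; ++I)
        if (I->mayStore())
          return false;
      return true;
    }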
