[llvm] r214580 - [X86] Simplify X87 stackifier pass.

Fri Aug 1 15:19:41 PDT 2014

Author: ahatanak
Date: Fri Aug  1 17:19:41 2014
New Revision: 214580

URL: http://llvm.org/viewvc/llvm-project?rev=214580&view=rev
Log:
[X86] Simplify X87 stackifier pass.

Stop using ST registers for function returns and inline-asm instructions and use
FP registers instead. This allows removing a large amount of code in the
stackifier pass that was needed to track register liveness and handle copies
between ST and FP registers and function calls returning floating point values.

It also fixes a bug which manifests when an ST register defined by an
inline-asm instruction was live across another inline-asm instruction, as shown
in the following sequence of machine instructions:

1. INLINEASM <es:frndint> $0:[regdef], %ST0<imp-def,tied5>
2. INLINEASM <es:fldcw $0>
3. %FP0<def> = COPY %ST0

<rdar://problem/16952634>

Modified:
    llvm/trunk/lib/Target/X86/X86CallingConv.td
    llvm/trunk/lib/Target/X86/X86FastISel.cpp
    llvm/trunk/lib/Target/X86/X86FloatingPoint.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrCompiler.td
    llvm/trunk/lib/Target/X86/X86InstrFPStack.td
    llvm/trunk/lib/Target/X86/X86RegisterInfo.td
    llvm/trunk/test/CodeGen/X86/inline-asm-fpstack.ll

Modified: llvm/trunk/lib/Target/X86/X86CallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallingConv.td?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86CallingConv.td (original)
+++ llvm/trunk/lib/Target/X86/X86CallingConv.td Fri Aug  1 17:19:41 2014
@@ -59,20 +59,20 @@ def RetCC_X86Common : CallingConv<[
   // MM0, it doesn't support these vector types.
   CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
 
-  // Long double types are always returned in ST0 (even with SSE).
-  CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+  // Long double types are always returned in FP0 (even with SSE).
+  CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
 ]>;
 
 // X86-32 C return-value convention.
 def RetCC_X86_32_C : CallingConv<[
-  // The X86-32 calling convention returns FP values in ST0, unless marked
+  // The X86-32 calling convention returns FP values in FP0, unless marked
   // with "inreg" (used here to distinguish one kind of reg from another,
   // weirdly; this is really the sse-regparm calling convention) in which
   // case they use XMM0, otherwise it is the same as the common X86 calling
   // conv.
   CCIfInReg<CCIfSubtarget<"hasSSE2()",
     CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
-  CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+  CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
   CCDelegateTo<RetCC_X86Common>
 ]>;
 

Modified: llvm/trunk/lib/Target/X86/X86FastISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FastISel.cpp?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FastISel.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FastISel.cpp Fri Aug  1 17:19:41 2014
@@ -1017,7 +1017,7 @@ bool X86FastISel::X86SelectRet(const Ins
 
     // The calling-convention tables for x87 returns don't tell
     // the whole story.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
       return false;
 
     unsigned SrcReg = Reg + VA.getValNo();
@@ -2989,39 +2989,33 @@ bool X86FastISel::FastLowerCall(CallLowe
       report_fatal_error("SSE register return with SSE disabled");
     }
 
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a COPY is not good enough - the copy instruction may be eliminated if the
-    // return value is not used. We use the FpPOP_RETVAL instruction instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) {
-        CopyVT = MVT::f80;
-        CopyReg = createResultReg(&X86::RFP80RegClass);
-      }
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(X86::FpPOP_RETVAL), CopyReg);
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      CopyVT = MVT::f80;
+      CopyReg = createResultReg(&X86::RFP80RegClass);
+    }
 
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register. This is accomplished by storing the f80 value in memory
-      // and then loading it back.
-      if (CopyVT != VA.getValVT()) {
-        EVT ResVT = VA.getValVT();
-        unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
-        unsigned MemSize = ResVT.getSizeInBits()/8;
-        int FI = MFI.CreateStackObject(MemSize, MemSize, false);
-        addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                  TII.get(Opc)), FI)
-          .addReg(CopyReg);
-        Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
-        addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                  TII.get(Opc), ResultReg + i), FI);
-      }
-    } else {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
-      InRegs.push_back(VA.getLocReg());
+    // Copy out the result.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+    InRegs.push_back(VA.getLocReg());
+
+    // Round the f80 to the right size, which also moves it to the appropriate
+    // xmm register. This is accomplished by storing the f80 value in memory
+    // and then loading it back.
+    if (CopyVT != VA.getValVT()) {
+      EVT ResVT = VA.getValVT();
+      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+      unsigned MemSize = ResVT.getSizeInBits()/8;
+      int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(Opc)), FI)
+        .addReg(CopyReg);
+      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(Opc), ResultReg + i), FI);
     }
   }
 

Modified: llvm/trunk/lib/Target/X86/X86FloatingPoint.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FloatingPoint.cpp?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FloatingPoint.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FloatingPoint.cpp Fri Aug  1 17:19:41 2014
@@ -25,15 +25,18 @@
 
 #include "X86.h"
 #include "X86InstrInfo.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/EdgeBundles.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/Debug.h"
@@ -50,6 +53,8 @@ STATISTIC(NumFXCH, "Number of fxch instr
 STATISTIC(NumFP  , "Number of floating point instructions");
 
 namespace {
+  const unsigned ScratchFPReg = 7;
+
   struct FPS : public MachineFunctionPass {
     static char ID;
     FPS() : MachineFunctionPass(ID) {
@@ -137,7 +142,7 @@ namespace {
     unsigned StackTop;          // The current top of the FP stack.
 
     enum {
-      NumFPRegs = 16            // Including scratch pseudo-registers.
+      NumFPRegs = 8             // Including scratch pseudo-registers.
     };
 
     // For each live FP<n> register, point to its Stack[] entry.
@@ -146,27 +151,6 @@ namespace {
     // register allocator thinks.
     unsigned RegMap[NumFPRegs];
 
-    // Pending fixed registers - Inline assembly needs FP registers to appear
-    // in fixed stack slot positions. This is handled by copying FP registers
-    // to ST registers before the instruction, and copying back after the
-    // instruction.
-    //
-    // This is modeled with pending ST registers. NumPendingSTs is the number
-    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
-    // register that holds the ST value. The ST registers are not moved into
-    // place until immediately before the instruction that needs them.
-    //
-    // It can happen that we need an ST register to be live when no FP register
-    // holds the value:
-    //
-    //   %ST0 = COPY %FP4<kill>
-    //
-    // When that happens, we allocate a scratch FP register to hold the ST
-    // value. That means every register in PendingST must be live.
-
-    unsigned NumPendingSTs;
-    unsigned char PendingST[8];
-
     // Set up our stack model to match the incoming registers to MBB.
     void setupBlockStack();
 
@@ -180,9 +164,6 @@ namespace {
         dbgs() << " FP" << Stack[i];
         assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
       }
-      for (unsigned i = 0; i != NumPendingSTs; ++i)
-        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
-      dbgs() << "\n";
     }
 #endif
 
@@ -199,19 +180,6 @@ namespace {
       return Slot < StackTop && Stack[Slot] == RegNo;
     }
 
-    /// getScratchReg - Return an FP register that is not currently in use.
-    unsigned getScratchReg() const {
-      for (int i = NumFPRegs - 1; i >= 8; --i)
-        if (!isLive(i))
-          return i;
-      llvm_unreachable("Ran out of scratch FP registers");
-    }
-
-    /// isScratchReg - Returns trus if RegNo is a scratch FP register.
-    static bool isScratchReg(unsigned RegNo) {
-      return RegNo > 8 && RegNo < NumFPRegs;
-    }
-
     /// getStackEntry - Return the X86::FP<n> register in register ST(i).
     unsigned getStackEntry(unsigned STi) const {
       if (STi >= StackTop)
@@ -263,21 +231,6 @@ namespace {
       BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
     }
 
-    /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
-    /// RegNo. If any PendingST registers still need the RegNo value, duplicate
-    /// them to new scratch registers.
-    void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
-      for (unsigned i = 0; i != NumPendingSTs; ++i) {
-        if (PendingST[i] != RegNo)
-          continue;
-        unsigned SR = getScratchReg();
-        DEBUG(dbgs() << "Duplicating pending ST" << i
-                     << " in FP" << RegNo << " to FP" << SR << '\n');
-        duplicateToTop(RegNo, SR, I);
-        PendingST[i] = SR;
-      }
-    }
-
     /// popStackAfter - Pop the current value off of the top of the FP stack
     /// after the specified instruction.
     void popStackAfter(MachineBasicBlock::iterator &I);
@@ -304,6 +257,7 @@ namespace {
 
     bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
 
+    void handleCall(MachineBasicBlock::iterator &I);
     void handleZeroArgFP(MachineBasicBlock::iterator &I);
     void handleOneArgFP(MachineBasicBlock::iterator &I);
     void handleOneArgFPRW(MachineBasicBlock::iterator &I);
@@ -320,6 +274,8 @@ namespace {
       return X86::RFP80RegClass.contains(DstReg) ||
         X86::RFP80RegClass.contains(SrcReg);
     }
+
+    void setKillFlags(MachineBasicBlock &MBB) const;
   };
   char FPS::ID = 0;
 }
@@ -409,8 +365,8 @@ void FPS::bundleCFG(MachineFunction &MF)
 bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
   bool Changed = false;
   MBB = &BB;
-  NumPendingSTs = 0;
 
+  setKillFlags(BB);
   setupBlockStack();
 
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
@@ -428,6 +384,9 @@ bool FPS::processBasicBlock(MachineFunct
         X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
       FPInstClass = X86II::SpecialFP;
 
+    if (MI->isCall())
+      FPInstClass = X86II::SpecialFP;
+
     if (FPInstClass == X86II::NotFP)
       continue;  // Efficiently ignore non-fp insts!
 
@@ -462,7 +421,9 @@ bool FPS::processBasicBlock(MachineFunct
     // after definition.  If so, pop them.
     for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
       unsigned Reg = DeadRegs[i];
-      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+      // Check if Reg is live on the stack. An inline-asm register operand that
+      // is in the clobber list and marked dead might not be live on the stack.
+      if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
         DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
         freeStackSlotAfter(I, Reg-X86::FP0);
       }
@@ -966,6 +927,31 @@ void FPS::shuffleStackTop(const unsigned
 // Instruction transformation implementation
 //===----------------------------------------------------------------------===//
 
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  unsigned STReturns = 0;
+
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned R = MO.getReg() - X86::FP0;
+
+    if (R < 8) {
+      assert(MO.isDef() && MO.isImplicit());
+      STReturns |= 1 << R;
+    }
+  }
+
+  unsigned N = CountTrailingOnes_32(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0.
+  assert(STReturns == 0 || isMask_32(STReturns) && N <= 2);
+
+  for (unsigned I = 0; I < N; ++I)
+    pushReg(N - I - 1);
+}
+
 /// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
 ///
 void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
@@ -992,9 +978,6 @@ void FPS::handleOneArgFP(MachineBasicBlo
   unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
   bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
 
-  if (KillsSrc)
-    duplicatePendingSTBeforeKill(Reg, I);
-
   // FISTP64m is strange because there isn't a non-popping versions.
   // If we have one _and_ we don't want to pop the operand, duplicate the value
   // on the stack instead of moving it.  This ensure that popping the value is
@@ -1015,7 +998,7 @@ void FPS::handleOneArgFP(MachineBasicBlo
        MI->getOpcode() == X86::ISTT_Fp32m80 ||
        MI->getOpcode() == X86::ISTT_Fp64m80 ||
        MI->getOpcode() == X86::ST_FpP80m)) {
-    duplicateToTop(Reg, getScratchReg(), I);
+    duplicateToTop(Reg, ScratchFPReg, I);
   } else {
     moveToTop(Reg, I);            // Move to the top of the stack...
   }
@@ -1058,7 +1041,6 @@ void FPS::handleOneArgFPRW(MachineBasicB
   bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
 
   if (KillsSrc) {
-    duplicatePendingSTBeforeKill(Reg, I);
     // If this is the last use of the source register, just make sure it's on
     // the top of the stack.
     moveToTop(Reg, I);
@@ -1316,69 +1298,20 @@ void FPS::handleCondMovFP(MachineBasicBl
 ///
 void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
   MachineInstr *MI = I;
+
+  if (MI->isCall()) {
+    handleCall(I);
+    return;
+  }
+
   switch (MI->getOpcode()) {
   default: llvm_unreachable("Unknown SpecialFP instruction!");
   case TargetOpcode::COPY: {
     // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
     const MachineOperand &MO1 = MI->getOperand(1);
     const MachineOperand &MO0 = MI->getOperand(0);
-    unsigned DstST = MO0.getReg() - X86::ST0;
-    unsigned SrcST = MO1.getReg() - X86::ST0;
     bool KillsSrc = MI->killsRegister(MO1.getReg());
 
-    // ST = COPY FP. Set up a pending ST register.
-    if (DstST < 8) {
-      unsigned SrcFP = getFPReg(MO1);
-      assert(isLive(SrcFP) && "Cannot copy dead register");
-      assert(!MO0.isDead() && "Cannot copy to dead ST register");
-
-      // Unallocated STs are marked as the nonexistent FP255.
-      while (NumPendingSTs <= DstST)
-        PendingST[NumPendingSTs++] = NumFPRegs;
-
-      // STi could still be live from a previous inline asm.
-      if (isScratchReg(PendingST[DstST])) {
-        DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
-                     << '\n');
-        freeStackSlotBefore(MI, PendingST[DstST]);
-      }
-
-      // When the source is killed, allocate a scratch FP register.
-      if (KillsSrc) {
-        duplicatePendingSTBeforeKill(SrcFP, I);
-        unsigned Slot = getSlot(SrcFP);
-        unsigned SR = getScratchReg();
-        PendingST[DstST] = SR;
-        Stack[Slot] = SR;
-        RegMap[SR] = Slot;
-      } else
-        PendingST[DstST] = SrcFP;
-      break;
-    }
-
-    // FP = COPY ST. Extract fixed stack value.
-    // Any instruction defining ST registers must have assigned them to a
-    // scratch register.
-    if (SrcST < 8) {
-      unsigned DstFP = getFPReg(MO0);
-      assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
-      assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
-      unsigned SrcFP = PendingST[SrcST];
-      assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
-      assert(isLive(SrcFP) && "Scratch holding ST is dead");
-
-      // DstFP steals the stack slot from SrcFP.
-      unsigned Slot = getSlot(SrcFP);
-      Stack[Slot] = DstFP;
-      RegMap[DstFP] = Slot;
-
-      // Always treat the ST as killed.
-      PendingST[SrcST] = NumFPRegs;
-      while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
-        --NumPendingSTs;
-      break;
-    }
-
     // FP <- FP copy.
     unsigned DstFP = getFPReg(MO0);
     unsigned SrcFP = getFPReg(MO1);
@@ -1406,36 +1339,6 @@ void FPS::handleSpecialFP(MachineBasicBl
     break;
   }
 
-  case X86::FpPOP_RETVAL: {
-    // The FpPOP_RETVAL instruction is used after calls that return a value on
-    // the floating point stack. We cannot model this with ST defs since CALL
-    // instructions have fixed clobber lists. This instruction is interpreted
-    // to mean that there is one more live register on the stack than we
-    // thought.
-    //
-    // This means that StackTop does not match the hardware stack between a
-    // call and the FpPOP_RETVAL instructions.  We do tolerate FP instructions
-    // between CALL and FpPOP_RETVAL as long as they don't overflow the
-    // hardware stack.
-    unsigned DstFP = getFPReg(MI->getOperand(0));
-
-    // Move existing stack elements up to reflect reality.
-    assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
-    if (StackTop) {
-      std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
-      for (unsigned i = 0; i != NumFPRegs; ++i)
-        ++RegMap[i];
-    }
-    ++StackTop;
-
-    // DstFP is the new bottom of the stack.
-    Stack[0] = DstFP;
-    RegMap[DstFP] = 0;
-
-    // DstFP will be killed by processBasicBlock if this was a dead def.
-    break;
-  }
-
   case TargetOpcode::INLINEASM: {
     // The inline asm MachineInstr currently only *uses* FP registers for the
     // 'f' constraint.  These should be turned into the current ST(x) register
@@ -1472,19 +1375,30 @@ void FPS::handleSpecialFP(MachineBasicBl
     // only tell clobbers from defs by looking at the asm descriptor.
     unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
     unsigned NumOps = 0;
+    SmallSet<unsigned, 1> FRegIdx;
+    unsigned RCID;
+
     for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
          i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
       unsigned Flags = MI->getOperand(i).getImm();
+
       NumOps = InlineAsm::getNumOperandRegisters(Flags);
       if (NumOps != 1)
         continue;
       const MachineOperand &MO = MI->getOperand(i + 1);
       if (!MO.isReg())
         continue;
-      unsigned STReg = MO.getReg() - X86::ST0;
+      unsigned STReg = MO.getReg() - X86::FP0;
       if (STReg >= 8)
         continue;
 
+      // If the flag has a register class constraint, this must be an operand
+      // with constraint "f". Record its index and continue.
+      if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+        FRegIdx.insert(i + 1);
+        continue;
+      }
+
       switch (InlineAsm::getKind(Flags)) {
       case InlineAsm::Kind_RegUse:
         STUses |= (1u << STReg);
@@ -1527,71 +1441,42 @@ void FPS::handleSpecialFP(MachineBasicBl
     DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
                  << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
 
-    // Scan the instruction for FP uses corresponding to "f" constraints.
-    // Collect FP registers to kill afer the instruction.
-    // Always kill all the scratch regs.
+#ifndef NDEBUG
+    // If any input operand uses constraint "f", all output register
+    // constraints must be early-clobber defs.
+    for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+      if (FRegIdx.count(I)) {
+        assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+               "Operands with constraint \"f\" cannot overlap with defs");
+      }
+#endif
+
+    // Collect all FP registers (register operands with constraints "t", "u",
+    // and "f") to kill afer the instruction.
     unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
-    unsigned FPUsed = 0;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       MachineOperand &Op = MI->getOperand(i);
       if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
         continue;
-      if (!Op.isUse())
-        MI->emitError("illegal \"f\" output constraint");
       unsigned FPReg = getFPReg(Op);
-      FPUsed |= 1U << FPReg;
 
       // If we kill this operand, make sure to pop it from the stack after the
       // asm.  We just remember it for now, and pop them all off at the end in
       // a batch.
-      if (Op.isKill())
+      if (Op.isUse() && Op.isKill())
         FPKills |= 1U << FPReg;
     }
 
-    // The popped inputs will be killed by the instruction, so duplicate them
-    // if the FP register needs to be live after the instruction, or if it is
-    // used in the instruction itself. We effectively treat the popped inputs
-    // as early clobbers.
-    for (unsigned i = 0; i < NumSTPopped; ++i) {
-      if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
-        continue;
-      unsigned SR = getScratchReg();
-      duplicateToTop(PendingST[i], SR, I);
-      DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
-                   << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
-      PendingST[i] = SR;
-    }
-
-    // Make sure we have a unique live register for every fixed use. Some of
-    // them could be undef uses, and we need to emit LD_F0 instructions.
-    for (unsigned i = 0; i < NumSTUses; ++i) {
-      if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
-        // Check for shared assignments.
-        for (unsigned j = 0; j < i; ++j) {
-          if (PendingST[j] != PendingST[i])
-            continue;
-          // STi and STj are inn the same register, create a copy.
-          unsigned SR = getScratchReg();
-          duplicateToTop(PendingST[i], SR, I);
-          DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
-                       << unsigned(PendingST[i])
-                       << " to avoid collision with ST" << j << '\n');
-          PendingST[i] = SR;
-        }
-        continue;
-      }
-      unsigned SR = getScratchReg();
-      DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
-      BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
-      pushReg(SR);
-      PendingST[i] = SR;
-      if (NumPendingSTs == i)
-        ++NumPendingSTs;
-    }
-    assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+    // Do not include registers that are implicitly popped by defs/clobbers.
+    FPKills &= ~(STDefs | STClobbers);
 
     // Now we can rearrange the live registers to match what was requested.
-    shuffleStackTop(PendingST, NumPendingSTs, I);
+    unsigned char STUsesArray[8];
+
+    for (unsigned I = 0; I < NumSTUses; ++I)
+      STUsesArray[I] = I;
+
+    shuffleStackTop(STUsesArray, NumSTUses, I);
     DEBUG({dbgs() << "Before asm: "; dumpStack();});
 
     // With the stack layout fixed, rewrite the FP registers.
@@ -1599,36 +1484,22 @@ void FPS::handleSpecialFP(MachineBasicBl
       MachineOperand &Op = MI->getOperand(i);
       if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
         continue;
+
       unsigned FPReg = getFPReg(Op);
-      Op.setReg(getSTReg(FPReg));
+
+      if (FRegIdx.count(i))
+        // Operand with constraint "f".
+        Op.setReg(getSTReg(FPReg));
+      else
+        // Operand with a single register class constraint ("t" or "u").
+        Op.setReg(X86::ST0 + FPReg);
     }
 
     // Simulate the inline asm popping its inputs and pushing its outputs.
     StackTop -= NumSTPopped;
 
-    // Hold the fixed output registers in scratch FP registers. They will be
-    // transferred to real FP registers by copies.
-    NumPendingSTs = 0;
-    for (unsigned i = 0; i < NumSTDefs; ++i) {
-      unsigned SR = getScratchReg();
-      pushReg(SR);
-      FPKills &= ~(1u << SR);
-    }
     for (unsigned i = 0; i < NumSTDefs; ++i)
-      PendingST[NumPendingSTs++] = getStackEntry(i);
-    DEBUG({dbgs() << "After asm: "; dumpStack();});
-
-    // If any of the ST defs were dead, pop them immediately. Our caller only
-    // handles dead FP defs.
-    MachineBasicBlock::iterator InsertPt = MI;
-    for (unsigned i = 0; STDefs & (1u << i); ++i) {
-      if (!(STDeadDefs & (1u << i)))
-        continue;
-      freeStackSlotAfter(InsertPt, PendingST[i]);
-      PendingST[i] = NumFPRegs;
-    }
-    while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
-      --NumPendingSTs;
+      pushReg(NumSTDefs - i - 1);
 
     // If this asm kills any FP registers (is the last use of them) we must
     // explicitly emit pop instructions for them.  Do this now after the asm has
@@ -1640,9 +1511,10 @@ void FPS::handleSpecialFP(MachineBasicBl
     while (FPKills) {
       unsigned FPReg = countTrailingZeros(FPKills);
       if (isLive(FPReg))
-        freeStackSlotAfter(InsertPt, FPReg);
+        freeStackSlotAfter(I, FPReg);
       FPKills &= ~(1U << FPReg);
     }
+
     // Don't delete the inline asm!
     return;
   }
@@ -1738,7 +1610,7 @@ void FPS::handleSpecialFP(MachineBasicBl
 
       // Duplicate the TOS so that we return it twice.  Just pick some other FPx
       // register to hold it.
-      unsigned NewReg = getScratchReg();
+      unsigned NewReg = ScratchFPReg;
       duplicateToTop(FirstFPRegOp, NewReg, MI);
       FirstFPRegOp = NewReg;
     }
@@ -1771,3 +1643,41 @@ void FPS::handleSpecialFP(MachineBasicBl
   } else
     --I;
 }
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+  const TargetRegisterInfo *TRI = MBB.getParent()->getTarget()
+    .getRegisterInfo();
+  LivePhysRegs LPR(TRI);
+
+  LPR.addLiveOuts(&MBB);
+
+  for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+       I != E; ++I) {
+    BitVector Defs(8);
+    SmallVector<MachineOperand *, 2> Uses;
+    MachineInstr &MI = *I;
+
+    for (auto &MO : I->operands()) {
+      if (!MO.isReg())
+        continue;
+
+      unsigned Reg = MO.getReg() - X86::FP0;
+
+      if (Reg >= 8)
+        continue;
+
+      if (MO.isDef()) {
+        Defs.set(Reg);
+        if (!LPR.contains(MO.getReg()))
+          MO.setIsDead();
+      } else
+        Uses.push_back(&MO);
+    }
+
+    for (auto *MO : Uses)
+      if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+        MO->setIsKill();
+
+    LPR.stepBackward(MI);
+  }
+}

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Aug  1 17:19:41 2014
@@ -1968,8 +1968,8 @@ X86TargetLowering::LowerReturn(SDValue C
 
     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
     // the RET instruction and handled by the FP Stackifier.
-    if (VA.getLocReg() == X86::ST0 ||
-        VA.getLocReg() == X86::ST1) {
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
       // If this is a copy from an xmm register to ST(0), use an FPExtend to
       // change the value to the FP stack register class.
       if (isScalarFPTypeInSSEReg(VA.getValVT()))
@@ -2107,33 +2107,21 @@ X86TargetLowering::LowerCallResult(SDVal
       report_fatal_error("SSE register return with SSE disabled");
     }
 
-    SDValue Val;
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT()))
+      CopyVT = MVT::f80;
+
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               CopyVT, InFlag).getValue(1);
+    SDValue Val = Chain.getValue(0);
+
+    if (CopyVT != VA.getValVT())
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1));
 
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a CopyFromReg is not good enough - the copy instruction may be eliminated
-    // if the return value is not used. We use the FpPOP_RETVAL instruction
-    // instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
-      SDValue Ops[] = { Chain, InFlag };
-      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
-                                         MVT::Other, MVT::Glue, Ops), 1);
-      Val = Chain.getValue(0);
-
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register.
-      if (CopyVT != VA.getValVT())
-        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
-                          // This truncation won't change the value.
-                          DAG.getIntPtrConstant(1));
-    } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                 CopyVT, InFlag).getValue(1);
-      Val = Chain.getValue(0);
-    }
     InFlag = Chain.getValue(2);
     InVals.push_back(Val);
   }
@@ -3291,7 +3279,7 @@ X86TargetLowering::IsEligibleForTailCall
     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
       CCValAssign &VA = RVLocs[i];
-      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
         return false;
     }
   }
@@ -23078,14 +23066,14 @@ X86TargetLowering::getRegForInlineAsmCon
         Constraint[5] == ')' &&
         Constraint[6] == '}') {
 
-      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.first = X86::FP0+Constraint[4]-'0';
       Res.second = &X86::RFP80RegClass;
       return Res;
     }
 
     // GCC allows "st(0)" to be called just plain "st".
     if (StringRef("{st}").equals_lower(Constraint)) {
-      Res.first = X86::ST0;
+      Res.first = X86::FP0;
       Res.second = &X86::RFP80RegClass;
       return Res;
     }

Modified: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrCompiler.td?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrCompiler.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td Fri Aug  1 17:19:41 2014
@@ -407,7 +407,8 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1
 // All calls clobber the non-callee saved registers. ESP is marked as
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
-let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
             MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
@@ -426,7 +427,8 @@ def TLS_base_addr32 : I<0, Pseudo, (outs
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
 let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
-            FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
             MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],

Modified: llvm/trunk/lib/Target/X86/X86InstrFPStack.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFPStack.td?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFPStack.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFPStack.td Fri Aug  1 17:19:41 2014
@@ -114,9 +114,6 @@ let usesCustomInserter = 1 in {  // Expa
 // a pattern) and the FPI instruction should have emission info (e.g. opcode
 // encoding and asm printing info).
 
-// Pseudo Instruction for FP stack return values.
-def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
-
 // FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
 // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
 // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.

Modified: llvm/trunk/lib/Target/X86/X86RegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86RegisterInfo.td?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86RegisterInfo.td (original)
+++ llvm/trunk/lib/Target/X86/X86RegisterInfo.td Fri Aug  1 17:19:41 2014
@@ -166,6 +166,7 @@ def FP3 : X86Reg<"fp3", 0>;
 def FP4 : X86Reg<"fp4", 0>;
 def FP5 : X86Reg<"fp5", 0>;
 def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
 
 // XMM Registers, used by the various SSE instruction set extensions.
 def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
@@ -234,22 +235,18 @@ let SubRegIndices = [sub_ymm] in {
   def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
   def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
 
-class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
-  let Aliases = A;
-}
-
 // Floating point stack registers. These don't map one-to-one to the FP
 // pseudo registers, but we still mark them as aliasing FP registers. That
 // way both kinds can be live without exceeding the stack depth. ST registers
 // are only live around inline assembly.
-def ST0 : STRegister<"st(0)", 0, []>,    DwarfRegNum<[33, 12, 11]>;
-def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
-def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
-def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
-def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
-def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
-def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
-def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
 
 // Floating-point status word
 def FPSW : X86Reg<"fpsw", 0>;

Modified: llvm/trunk/test/CodeGen/X86/inline-asm-fpstack.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/inline-asm-fpstack.ll?rev=214580&r1=214579&r2=214580&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/inline-asm-fpstack.ll (original)
+++ llvm/trunk/test/CodeGen/X86/inline-asm-fpstack.ll Fri Aug  1 17:19:41 2014
@@ -340,3 +340,65 @@ entry:
   %0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
   ret i32 %0
 }
+
+; <rdar://problem/16952634>
+; X87 stackifier asserted when there was an ST register defined by an
+; inline-asm instruction and the ST register was live across another
+; inline-asm instruction.
+;
+; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; %FP0<def> = COPY %ST0
+
+; CHECK-LABEL: _test_live_st
+; CHECK: ## InlineAsm Start
+; CHECK: frndint
+; CHECK: ## InlineAsm End
+; CHECK: ## InlineAsm Start
+; CHECK: fldcw
+; CHECK: ## InlineAsm End
+
+%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
+%struct.anon1 = type { i32, i32, i32 }
+%struct.anon2 = type { i32, i32, i32, i32 }
+
+ at fpu = external global %struct.fpu_t, align 16
+
+; Function Attrs: ssp
+define void @test_live_st(i32 %a1) {
+entry:
+  %0 = load x86_fp80* undef, align 16
+  %cond = icmp eq i32 %a1, 1
+  br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit
+
+sw.bb4.i:
+  %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0)
+  call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef)
+  br label %_Z5tointRKe.exit
+
+_Z5tointRKe.exit:
+  %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ]
+  %conv.i1814 = fptosi x86_fp80 %result.0.i to i32
+  %conv626 = sitofp i32 %conv.i1814 to x86_fp80
+  store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1)
+  br label %return
+
+return:
+  ret void
+}
+
+; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
+;
+; CHECK-LABEL: _test_operand_rewrite
+; CHECK: ## InlineAsm Start
+; CHECK: foo %st(0), %st(1)
+; CHECK: ## InlineAsm End
+
+define double @test_operand_rewrite() {
+entry:
+  %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { double, double } %0, 0
+  %asmresult1 = extractvalue { double, double } %0, 1
+  %sub = fsub double %asmresult, %asmresult1
+  ret double %sub
+}