[llvm] r242398 - [X86] Reapply r240257: "Allow more call sequences to use push instructions for argument passing"

Michael Kuperstein michael.m.kuperstein at intel.com
Thu Jul 16 06:54:14 PDT 2015


Author: mkuper
Date: Thu Jul 16 08:54:14 2015
New Revision: 242398

URL: http://llvm.org/viewvc/llvm-project?rev=242398&view=rev
Log:
[X86] Reapply r240257: "Allow more call sequences to use push instructions for argument passing"

This allows more call sequences to use pushes instead of movs when optimizing for size.
In particular, calling conventions that pass some parameters in registers (e.g. thiscall) are now supported.

This should no longer cause miscompiles, now that a bug in emitPrologue was fixed in r242395.
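
To see the effect, compare the old and new CHECK lines for test4 below. For a call
with an inreg parameter, the generated code changes roughly like this (AT&T syntax,
offsets as in the test):

  Before (mov-based sequence):
          subl    $12, %esp
          movl    $4, 8(%esp)
          movl    $3, 4(%esp)
          movl    $1, (%esp)
          movl    $2, %eax             # the inreg parameter
          calll   _inreg
          addl    $12, %esp

  After (push-based sequence, smaller encoding when optimizing for size):
          movl    $2, %eax             # the inreg parameter is untouched
          pushl   $4
          pushl   $3
          pushl   $1
          calll   _inreg
          addl    $12, %esp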

Modified:
    llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
    llvm/trunk/test/CodeGen/X86/movtopush.ll

Modified: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp?rev=242398&r1=242397&r2=242398&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp Thu Jul 16 08:54:14 2015
@@ -78,7 +78,7 @@ private:
   typedef DenseMap<MachineInstr *, CallContext> ContextMap;
 
   bool isLegal(MachineFunction &MF);
-  
+
   bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap);
 
   void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -90,6 +90,13 @@ private:
   MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
                                    unsigned Reg);
 
+  enum InstClassification { Convert, Skip, Exit };
+
+  InstClassification classifyInstruction(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         const X86RegisterInfo &RegInfo,
+                                         DenseSet<unsigned int> &UsedRegs);
+
   const char *getPassName() const override { return "X86 Optimize Call Frame"; }
 
   const TargetInstrInfo *TII;
@@ -105,7 +112,7 @@ FunctionPass *llvm::createX86CallFrameOp
   return new X86CallFrameOptimization();
 }
 
-// This checks whether the transformation is legal. 
+// This checks whether the transformation is legal.
 // Also returns false in cases where it's potentially legal, but
 // we don't even want to try.
 bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
@@ -170,9 +177,8 @@ bool X86CallFrameOptimization::isProfita
   if (!OptForSize)
     return false;
 
-
   unsigned StackAlign = TFL->getStackAlignment();
-  
+
   int64_t Advantage = 0;
   for (auto CC : CallSeqMap) {
     // Call sites where no parameters are passed on the stack
@@ -205,7 +211,6 @@ bool X86CallFrameOptimization::isProfita
   return (Advantage >= 0);
 }
 
-
 bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   TFL = MF.getSubtarget().getFrameLowering();
@@ -237,6 +242,64 @@ bool X86CallFrameOptimization::runOnMach
   return Changed;
 }
 
+X86CallFrameOptimization::InstClassification
+X86CallFrameOptimization::classifyInstruction(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
+  if (MI == MBB.end())
+    return Exit;
+
+  // The instructions we actually care about are movs onto the stack
+  int Opcode = MI->getOpcode();
+  if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
+    return Convert;
+
+  // Not all calling conventions have only stack MOVs between the stack
+  // adjust and the call.
+
+  // We want to tolerate other instructions, to cover more cases.
+  // In particular:
+  // a) PCrel calls, where we expect an additional COPY of the basereg.
+  // b) Passing frame-index addresses.
+  // c) Calling conventions that have inreg parameters. These generate
+  //    both copies and movs into registers.
+  // To avoid creating lots of special cases, allow any instruction
+  // that does not write into memory, does not def or use the stack
+  // pointer, and does not def any register that was used by a preceding
+  // push.
+  // (Reading from memory is allowed, even if referenced through a
+  // frame index, since these will get adjusted properly in PEI)
+
+  // The reason for the last condition is that the pushes can't replace
+  // the movs in place, because the order must be reversed.
+  // So if we have a MOV32mr that uses EDX, then an instruction that defs
+  // EDX, and then the call, after the transformation the push will use
+  // the modified version of EDX, and not the original one.
+  // Since we are still in SSA form at this point, we only need to
+  // make sure we don't clobber any *physical* registers that were
+  // used by an earlier mov that will become a push.
+
+  if (MI->isCall() || MI->mayStore())
+    return Exit;
+
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    unsigned int Reg = MO.getReg();
+    if (!RegInfo.isPhysicalRegister(Reg))
+      continue;
+    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
+      return Exit;
+    if (MO.isDef()) {
+      for (unsigned int U : UsedRegs)
+        if (RegInfo.regsOverlap(Reg, U))
+          return Exit;
+    }
+  }
+
+  return Skip;
+}
+
 void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
                                                MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator I,
@@ -254,8 +317,8 @@ void X86CallFrameOptimization::collectCa
 
   // How much do we adjust the stack? This puts an upper bound on
   // the number of parameters actually passed on it.
-  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;  
-  
+  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+
   // A zero adjustment means no stack parameters
   if (!MaxAdjust) {
     Context.NoStackParams = true;
@@ -284,11 +347,17 @@ void X86CallFrameOptimization::collectCa
   if (MaxAdjust > 4)
     Context.MovVector.resize(MaxAdjust, nullptr);
 
-  do {
-    int Opcode = I->getOpcode();
-    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
-      break;
+  InstClassification Classification;
+  DenseSet<unsigned int> UsedRegs;
 
+  while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
+         Exit) {
+    if (Classification == Skip) {
+      ++I;
+      continue;
+    }
+
+    // We know the instruction is a MOV32mi/MOV32mr.
     // We only want movs of the form:
     // movl imm/r32, k(%esp)
     // If we run into something else, bail.
@@ -323,24 +392,20 @@ void X86CallFrameOptimization::collectCa
       return;
     Context.MovVector[StackDisp] = I;
 
-    ++I;
-  } while (I != MBB.end());
-
-  // We now expect the end of the sequence - a call and a stack adjust.
-  if (I == MBB.end())
-    return;
+    for (const MachineOperand &MO : I->uses()) {
+      if (!MO.isReg())
+        continue;
+      unsigned int Reg = MO.getReg();
+      if (RegInfo.isPhysicalRegister(Reg))
+        UsedRegs.insert(Reg);
+    }
 
-  // For PCrel calls, we expect an additional COPY of the basereg.
-  // If we find one, skip it.
-  if (I->isCopy()) {
-    if (I->getOperand(1).getReg() ==
-        MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
-      ++I;
-    else
-      return;
+    ++I;
   }
 
-  if (!I->isCall())
+  // We now expect the end of the sequence. If we stopped early,
+  // or reached the end of the block without finding a call, bail.
+  if (I == MBB.end() || !I->isCall())
     return;
 
   Context.Call = I;
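
A note on the UsedRegs bookkeeping above: the pushes are emitted adjacent to the
call and in reverse order of the original movs, so an instruction between the
stack adjust and the call that defs a register read by an earlier mov would change
what gets pushed. A hedged sketch with hypothetical instructions (not taken from
the patch):

          movl    %edx, (%esp)         # stack argument comes from EDX
          xorl    %edx, %edx           # EDX is clobbered before the call
          calll   _f

  A naive conversion would push after the clobber:

          xorl    %edx, %edx
          pushl   %edx                 # wrong: pushes the zeroed EDX
          calll   _f

This is why classifyInstruction returns Exit as soon as an instruction defs a
register recorded in UsedRegs.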

Modified: llvm/trunk/test/CodeGen/X86/movtopush.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movtopush.ll?rev=242398&r1=242397&r2=242398&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/movtopush.ll (original)
+++ llvm/trunk/test/CodeGen/X86/movtopush.ll Thu Jul 16 08:54:14 2015
@@ -2,11 +2,15 @@
 ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
 ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED 
 
+%class.Class = type { i32 }
+%struct.s = type { i64 }
+
 declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
 declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
+declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 %c, i32 %d)
 declare void @oneparam(i32 %a)
 declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
-
+declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
 
 ; Here, we should have a reserved frame, so we don't expect pushes
 ; NORMAL-LABEL: test1:
@@ -108,13 +112,12 @@ entry:
   ret void
 }
 
-; We don't support weird calling conventions
+; We support weird calling conventions
 ; NORMAL-LABEL: test4:
-; NORMAL: subl    $12, %esp
-; NORMAL-NEXT: movl    $4, 8(%esp)
-; NORMAL-NEXT: movl    $3, 4(%esp)
-; NORMAL-NEXT: movl    $1, (%esp)
-; NORMAL-NEXT: movl    $2, %eax
+; NORMAL: movl    $2, %eax
+; NORMAL-NEXT: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $1
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $12, %esp
 define void @test4() optsize {
@@ -123,6 +126,20 @@ entry:
   ret void
 }
 
+; NORMAL-LABEL: test4b:
+; NORMAL: movl 4(%esp), %ecx
+; NORMAL-NEXT: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: ret
+define void @test4b(%class.Class* %f) optsize {
+entry:
+  call x86_thiscallcc void @thiscall(%class.Class* %f, i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
 ; When there is no reserved call frame, check that additional alignment
 ; is added when the pushes don't add up to the required alignment.
 ; ALIGNED-LABEL: test5:
@@ -229,20 +246,27 @@ entry:
 ; NORMAL-NEXT: pushl $1
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $16, %esp
-; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
-; NORMAL-NEXT: movl    [[EAX]], 12(%esp)
-; NORMAL-NEXT: movl    $7, 8(%esp)
-; NORMAL-NEXT: movl    $6, 4(%esp)
-; NORMAL-NEXT: movl    $5, (%esp)
+; NORMAL-NEXT: subl $20, %esp
+; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
+; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
+; NORMAL-NEXT: movl    [[E2]], 4(%esp)
+; NORMAL-NEXT: movl    [[E1]], (%esp)
+; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]]
+; NORMAL-NEXT: movl    [[E3]], 16(%esp)
+; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]]
+; NORMAL-NEXT: movl    [[E4]], 12(%esp)
+; NORMAL-NEXT: movl    $6, 8(%esp)
 ; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: addl $20, %esp
 define void @test9() optsize {
 entry:
   %p = alloca i32, align 4
+  %q = alloca i32, align 4
+  %s = alloca %struct.s, align 4  
   call void @good(i32 1, i32 2, i32 3, i32 4)
-  %0 = ptrtoint i32* %p to i32
-  call void @good(i32 5, i32 6, i32 7, i32 %0)
+  %pv = ptrtoint i32* %p to i32
+  %qv = ptrtoint i32* %q to i32
+  call void @struct(%struct.s* byval %s, i32 6, i32 %qv, i32 %pv)
   ret void
 }
 
@@ -291,28 +315,17 @@ define void @test11() optsize {
 ; Converting one mov into a push isn't worth it when 
 ; doing so forces too much overhead for other calls.
 ; NORMAL-LABEL: test12:
-; NORMAL: subl    $16, %esp
-; NORMAL-NEXT: movl    $4, 8(%esp)
-; NORMAL-NEXT: movl    $3, 4(%esp)
-; NORMAL-NEXT: movl    $1, (%esp)
-; NORMAL-NEXT: movl    $2, %eax
-; NORMAL-NEXT: calll _inreg
-; NORMAL-NEXT: movl    $8, 12(%esp)
+; NORMAL: movl    $8, 12(%esp)
 ; NORMAL-NEXT: movl    $7, 8(%esp)
 ; NORMAL-NEXT: movl    $6, 4(%esp)
 ; NORMAL-NEXT: movl    $5, (%esp)
 ; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: movl    $12, 8(%esp)
-; NORMAL-NEXT: movl    $11, 4(%esp)
-; NORMAL-NEXT: movl    $9, (%esp)
-; NORMAL-NEXT: movl    $10, %eax
-; NORMAL-NEXT: calll _inreg
-; NORMAL-NEXT: addl $16, %esp
 define void @test12() optsize {
 entry:
-  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  %s = alloca %struct.s, align 4  
+  call void @struct(%struct.s* %s, i32 2, i32 3, i32 4)
   call void @good(i32 5, i32 6, i32 7, i32 8)
-  call void @inreg(i32 9, i32 10, i32 11, i32 12)
+  call void @struct(%struct.s* %s, i32 10, i32 11, i32 12)
   ret void
 }
 
@@ -324,13 +337,12 @@ entry:
 ; NORMAL-NEXT: pushl    $1
 ; NORMAL-NEXT: calll _good
 ; NORMAL-NEXT: addl    $16, %esp
-; NORMAL-NEXT: subl    $12, %esp
-; NORMAL-NEXT: movl    $8, 8(%esp)
-; NORMAL-NEXT: movl    $7, 4(%esp)
-; NORMAL-NEXT: movl    $5, (%esp)
-; NORMAL-NEXT: movl    $6, %eax
-; NORMAL-NEXT: calll _inreg
-; NORMAL-NEXT: addl    $12, %esp
+; NORMAL-NEXT: subl    $20, %esp
+; NORMAL: movl    $8, 16(%esp)
+; NORMAL-NEXT: movl    $7, 12(%esp)
+; NORMAL-NEXT: movl    $6, 8(%esp)
+; NORMAL-NEXT: calll _struct
+; NORMAL-NEXT: addl    $20, %esp
 ; NORMAL-NEXT: pushl    $12
 ; NORMAL-NEXT: pushl    $11
 ; NORMAL-NEXT: pushl    $10
@@ -339,8 +351,9 @@ entry:
 ; NORMAL-NEXT: addl $16, %esp
 define void @test12b() optsize {
 entry:
-  call void @good(i32 1, i32 2, i32 3, i32 4)
-  call void @inreg(i32 5, i32 6, i32 7, i32 8)
+  %s = alloca %struct.s, align 4  
+  call void @good(i32 1, i32 2, i32 3, i32 4)  
+  call void @struct(%struct.s* %s, i32 6, i32 7, i32 8)
   call void @good(i32 9, i32 10, i32 11, i32 12)
   ret void
 }




