[llvm] r240257 - [X86] Allow more call sequences to use push instructions for argument passing

Reid Kleckner rnk at google.com
Wed Jul 15 18:11:07 PDT 2015


The size win from this is pretty exciting, but it's miscompiling a function
in Chromium:
https://code.google.com/p/chromium/codesearch#chromium/src/third_party/libjingle/source/talk/session/media/mediasession.cc&l=606

On the bright side, it's only one!

We end up with a fastcc function (taking parameters in ecx / edx) calling a
thiscall function in the entry block, and somehow the stack adjustment ends
up in the wrong place:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        pushl   %edi
        pushl   %esi  <---- end pushing non-volatile regs
        pushl   %edx <----- set up second argument to GetContentByName, ecx is already arranged
        subl    $48, %esp <---- BUG: stack allocation here!?
        calll   "?GetContentByName@SessionDescription@cricket@@QAEPAUContentInfo@2@ABV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z"

I've attached the extracted function and you can see the bug by inspection
with llc.
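
For reference, here is a rough sketch of the shape involved (hypothetical names and types, not the attached extracted.ll, and it may not reproduce the exact miscompile), compiled with something like llc -mtriple=i686-pc-windows-msvc -O2:

; Hypothetical reduction -- the attached extracted.ll is the real reproducer.
%class.SessionDescription = type { i32 }
%struct.ContentInfo = type { i32 }

declare x86_thiscallcc %struct.ContentInfo* @GetContentByName(%class.SessionDescription*, i8*)

define fastcc %struct.ContentInfo* @Caller(%class.SessionDescription* %desc, i8* %name) optsize {
entry:
  ; fastcc hands %desc / %name over in ecx / edx; the thiscall callee wants
  ; "this" in ecx and the second argument pushed on the stack, so this is the
  ; call sequence the push conversion rewrites (optsize is what enables it).
  %ci = call x86_thiscallcc %struct.ContentInfo* @GetContentByName(%class.SessionDescription* %desc, i8* %name)
  ret %struct.ContentInfo* %ci
}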

I'm going to flip the sense of the call frame optimization command-line flag so
that it's off by default for now. Feel free to flip it back when you sort this
out.

On Mon, Jun 22, 2015 at 1:31 AM, Michael Kuperstein <michael.m.kuperstein at intel.com> wrote:

> Author: mkuper
> Date: Mon Jun 22 03:31:22 2015
> New Revision: 240257
>
> URL: http://llvm.org/viewvc/llvm-project?rev=240257&view=rev
> Log:
> [X86] Allow more call sequences to use push instructions for argument passing
>
> This allows more call sequences to use pushes instead of movs when
> optimizing for size.
> In particular, calling conventions that pass some parameters in registers
> (e.g. thiscall) are now supported.
>
> Differential Revision: http://reviews.llvm.org/D10500
>
> Modified:
>     llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
>     llvm/trunk/test/CodeGen/X86/movtopush.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp?rev=240257&r1=240256&r2=240257&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp Mon Jun 22 03:31:22 2015
> @@ -78,7 +78,7 @@ private:
>    typedef DenseMap<MachineInstr *, CallContext> ContextMap;
>
>    bool isLegal(MachineFunction &MF);
> -
> +
>    bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap);
>
>    void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
> @@ -90,6 +90,13 @@ private:
>    MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
>                                     unsigned Reg);
>
> +  enum InstClassification { Convert, Skip, Exit };
> +
> +  InstClassification classifyInstruction(MachineBasicBlock &MBB,
> +                                         MachineBasicBlock::iterator MI,
> +                                         const X86RegisterInfo &RegInfo,
> +                                         DenseSet<unsigned int> &UsedRegs);
> +
>    const char *getPassName() const override { return "X86 Optimize Call Frame"; }
>
>    const TargetInstrInfo *TII;
> @@ -105,7 +112,7 @@ FunctionPass *llvm::createX86CallFrameOp
>    return new X86CallFrameOptimization();
>  }
>
> -// This checks whether the transformation is legal.
> +// This checks whether the transformation is legal.
>  // Also returns false in cases where it's potentially legal, but
>  // we don't even want to try.
>  bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
> @@ -170,9 +177,8 @@ bool X86CallFrameOptimization::isProfita
>    if (!OptForSize)
>      return false;
>
> -
>    unsigned StackAlign = TFL->getStackAlignment();
> -
> +
>    int64_t Advantage = 0;
>    for (auto CC : CallSeqMap) {
>      // Call sites where no parameters are passed on the stack
> @@ -205,7 +211,6 @@ bool X86CallFrameOptimization::isProfita
>    return (Advantage >= 0);
>  }
>
> -
>  bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
>    TII = MF.getSubtarget().getInstrInfo();
>    TFL = MF.getSubtarget().getFrameLowering();
> @@ -237,6 +242,64 @@ bool X86CallFrameOptimization::runOnMach
>    return Changed;
>  }
>
> +X86CallFrameOptimization::InstClassification
> +X86CallFrameOptimization::classifyInstruction(
> +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
> +    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
> +  if (MI == MBB.end())
> +    return Exit;
> +
> +  // The instructions we actually care about are movs onto the stack
> +  int Opcode = MI->getOpcode();
> +  if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
> +    return Convert;
> +
> +  // Not all calling conventions have only stack MOVs between the stack
> +  // adjust and the call.
> +
> +  // We want to tolerate other instructions, to cover more cases.
> +  // In particular:
> +  // a) PCrel calls, where we expect an additional COPY of the basereg.
> +  // b) Passing frame-index addresses.
> +  // c) Calling conventions that have inreg parameters. These generate
> +  //    both copies and movs into registers.
> +  // To avoid creating lots of special cases, allow any instruction
> +  // that does not write into memory, does not def or use the stack
> +  // pointer, and does not def any register that was used by a preceding
> +  // push.
> +  // (Reading from memory is allowed, even if referenced through a
> +  // frame index, since these will get adjusted properly in PEI)
> +
> +  // The reason for the last condition is that the pushes can't replace
> +  // the movs in place, because the order must be reversed.
> +  // So if we have a MOV32mr that uses EDX, then an instruction that defs
> +  // EDX, and then the call, after the transformation the push will use
> +  // the modified version of EDX, and not the original one.
> +  // Since we are still in SSA form at this point, we only need to
> +  // make sure we don't clobber any *physical* registers that were
> +  // used by an earlier mov that will become a push.
> +
> +  if (MI->isCall() || MI->mayStore())
> +    return Exit;
> +
> +  for (const MachineOperand &MO : MI->operands()) {
> +    if (!MO.isReg())
> +      continue;
> +    unsigned int Reg = MO.getReg();
> +    if (!RegInfo.isPhysicalRegister(Reg))
> +      continue;
> +    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
> +      return Exit;
> +    if (MO.isDef()) {
> +      for (unsigned int U : UsedRegs)
> +        if (RegInfo.regsOverlap(Reg, U))
> +          return Exit;
> +    }
> +  }
> +
> +  return Skip;
> +}
> +
>  void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
>                                                 MachineBasicBlock &MBB,
>                                                  MachineBasicBlock::iterator I,
> @@ -254,8 +317,8 @@ void X86CallFrameOptimization::collectCa
>
>    // How much do we adjust the stack? This puts an upper bound on
>    // the number of parameters actually passed on it.
> -  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
> -
> +  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
> +
>    // A zero adjustment means no stack parameters
>    if (!MaxAdjust) {
>      Context.NoStackParams = true;
> @@ -284,11 +347,17 @@ void X86CallFrameOptimization::collectCa
>    if (MaxAdjust > 4)
>      Context.MovVector.resize(MaxAdjust, nullptr);
>
> -  do {
> -    int Opcode = I->getOpcode();
> -    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
> -      break;
> +  InstClassification Classification;
> +  DenseSet<unsigned int> UsedRegs;
>
> +  while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
> +         Exit) {
> +    if (Classification == Skip) {
> +      ++I;
> +      continue;
> +    }
> +
> +    // We know the instruction is a MOV32mi/MOV32mr.
>      // We only want movs of the form:
>      // movl imm/r32, k(%esp)
>      // If we run into something else, bail.
> @@ -323,24 +392,20 @@ void X86CallFrameOptimization::collectCa
>        return;
>      Context.MovVector[StackDisp] = I;
>
> -    ++I;
> -  } while (I != MBB.end());
> -
> -  // We now expect the end of the sequence - a call and a stack adjust.
> -  if (I == MBB.end())
> -    return;
> +    for (const MachineOperand &MO : I->uses()) {
> +      if (!MO.isReg())
> +        continue;
> +      unsigned int Reg = MO.getReg();
> +      if (RegInfo.isPhysicalRegister(Reg))
> +        UsedRegs.insert(Reg);
> +    }
>
> -  // For PCrel calls, we expect an additional COPY of the basereg.
> -  // If we find one, skip it.
> -  if (I->isCopy()) {
> -    if (I->getOperand(1).getReg() ==
> -        MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
> -      ++I;
> -    else
> -      return;
> +    ++I;
>    }
>
> -  if (!I->isCall())
> +  // We now expect the end of the sequence. If we stopped early,
> +  // or reached the end of the block without finding a call, bail.
> +  if (I == MBB.end() || !I->isCall())
>      return;
>
>    Context.Call = I;
>
> Modified: llvm/trunk/test/CodeGen/X86/movtopush.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movtopush.ll?rev=240257&r1=240256&r2=240257&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/movtopush.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/movtopush.ll Mon Jun 22 03:31:22 2015
> @@ -2,11 +2,15 @@
>  ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
>  ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
>
> +%class.Class = type { i32 }
> +%struct.s = type { i64 }
> +
>  declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
>  declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
> +declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 %c, i32 %d)
>  declare void @oneparam(i32 %a)
>  declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
> -
> +declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
>
>  ; Here, we should have a reserved frame, so we don't expect pushes
>  ; NORMAL-LABEL: test1:
> @@ -108,13 +112,12 @@ entry:
>    ret void
>  }
>
> -; We don't support weird calling conventions
> +; We support weird calling conventions
>  ; NORMAL-LABEL: test4:
> -; NORMAL: subl    $12, %esp
> -; NORMAL-NEXT: movl    $4, 8(%esp)
> -; NORMAL-NEXT: movl    $3, 4(%esp)
> -; NORMAL-NEXT: movl    $1, (%esp)
> -; NORMAL-NEXT: movl    $2, %eax
> +; NORMAL: movl    $2, %eax
> +; NORMAL-NEXT: pushl   $4
> +; NORMAL-NEXT: pushl   $3
> +; NORMAL-NEXT: pushl   $1
>  ; NORMAL-NEXT: call
>  ; NORMAL-NEXT: addl $12, %esp
>  define void @test4() optsize {
> @@ -123,6 +126,20 @@ entry:
>    ret void
>  }
>
> +; NORMAL-LABEL: test4b:
> +; NORMAL: movl 4(%esp), %ecx
> +; NORMAL-NEXT: pushl   $4
> +; NORMAL-NEXT: pushl   $3
> +; NORMAL-NEXT: pushl   $2
> +; NORMAL-NEXT: pushl   $1
> +; NORMAL-NEXT: call
> +; NORMAL-NEXT: ret
> +define void @test4b(%class.Class* %f) optsize {
> +entry:
> +  call x86_thiscallcc void @thiscall(%class.Class* %f, i32 1, i32 2, i32 3, i32 4)
> +  ret void
> +}
> +
>  ; When there is no reserved call frame, check that additional alignment
>  ; is added when the pushes don't add up to the required alignment.
>  ; ALIGNED-LABEL: test5:
> @@ -229,20 +246,27 @@ entry:
>  ; NORMAL-NEXT: pushl $1
>  ; NORMAL-NEXT: call
>  ; NORMAL-NEXT: addl $16, %esp
> -; NORMAL-NEXT: subl $16, %esp
> -; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
> -; NORMAL-NEXT: movl    [[EAX]], 12(%esp)
> -; NORMAL-NEXT: movl    $7, 8(%esp)
> -; NORMAL-NEXT: movl    $6, 4(%esp)
> -; NORMAL-NEXT: movl    $5, (%esp)
> +; NORMAL-NEXT: subl $20, %esp
> +; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
> +; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
> +; NORMAL-NEXT: movl    [[E2]], 4(%esp)
> +; NORMAL-NEXT: movl    [[E1]], (%esp)
> +; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]]
> +; NORMAL-NEXT: movl    [[E3]], 16(%esp)
> +; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]]
> +; NORMAL-NEXT: movl    [[E4]], 12(%esp)
> +; NORMAL-NEXT: movl    $6, 8(%esp)
>  ; NORMAL-NEXT: call
> -; NORMAL-NEXT: addl $16, %esp
> +; NORMAL-NEXT: addl $20, %esp
>  define void @test9() optsize {
>  entry:
>    %p = alloca i32, align 4
> +  %q = alloca i32, align 4
> +  %s = alloca %struct.s, align 4
>    call void @good(i32 1, i32 2, i32 3, i32 4)
> -  %0 = ptrtoint i32* %p to i32
> -  call void @good(i32 5, i32 6, i32 7, i32 %0)
> +  %pv = ptrtoint i32* %p to i32
> +  %qv = ptrtoint i32* %q to i32
> +  call void @struct(%struct.s* byval %s, i32 6, i32 %qv, i32 %pv)
>    ret void
>  }
>
> @@ -291,28 +315,17 @@ define void @test11() optsize {
>  ; Converting one mov into a push isn't worth it when
>  ; doing so forces too much overhead for other calls.
>  ; NORMAL-LABEL: test12:
> -; NORMAL: subl    $16, %esp
> -; NORMAL-NEXT: movl    $4, 8(%esp)
> -; NORMAL-NEXT: movl    $3, 4(%esp)
> -; NORMAL-NEXT: movl    $1, (%esp)
> -; NORMAL-NEXT: movl    $2, %eax
> -; NORMAL-NEXT: calll _inreg
> -; NORMAL-NEXT: movl    $8, 12(%esp)
> +; NORMAL: movl    $8, 12(%esp)
>  ; NORMAL-NEXT: movl    $7, 8(%esp)
>  ; NORMAL-NEXT: movl    $6, 4(%esp)
>  ; NORMAL-NEXT: movl    $5, (%esp)
>  ; NORMAL-NEXT: calll _good
> -; NORMAL-NEXT: movl    $12, 8(%esp)
> -; NORMAL-NEXT: movl    $11, 4(%esp)
> -; NORMAL-NEXT: movl    $9, (%esp)
> -; NORMAL-NEXT: movl    $10, %eax
> -; NORMAL-NEXT: calll _inreg
> -; NORMAL-NEXT: addl $16, %esp
>  define void @test12() optsize {
>  entry:
> -  call void @inreg(i32 1, i32 2, i32 3, i32 4)
> +  %s = alloca %struct.s, align 4
> +  call void @struct(%struct.s* %s, i32 2, i32 3, i32 4)
>    call void @good(i32 5, i32 6, i32 7, i32 8)
> -  call void @inreg(i32 9, i32 10, i32 11, i32 12)
> +  call void @struct(%struct.s* %s, i32 10, i32 11, i32 12)
>    ret void
>  }
>
> @@ -324,13 +337,12 @@ entry:
>  ; NORMAL-NEXT: pushl    $1
>  ; NORMAL-NEXT: calll _good
>  ; NORMAL-NEXT: addl    $16, %esp
> -; NORMAL-NEXT: subl    $12, %esp
> -; NORMAL-NEXT: movl    $8, 8(%esp)
> -; NORMAL-NEXT: movl    $7, 4(%esp)
> -; NORMAL-NEXT: movl    $5, (%esp)
> -; NORMAL-NEXT: movl    $6, %eax
> -; NORMAL-NEXT: calll _inreg
> -; NORMAL-NEXT: addl    $12, %esp
> +; NORMAL-NEXT: subl    $20, %esp
> +; NORMAL: movl    $8, 16(%esp)
> +; NORMAL-NEXT: movl    $7, 12(%esp)
> +; NORMAL-NEXT: movl    $6, 8(%esp)
> +; NORMAL-NEXT: calll _struct
> +; NORMAL-NEXT: addl    $20, %esp
>  ; NORMAL-NEXT: pushl    $12
>  ; NORMAL-NEXT: pushl    $11
>  ; NORMAL-NEXT: pushl    $10
> @@ -339,8 +351,9 @@ entry:
>  ; NORMAL-NEXT: addl $16, %esp
>  define void @test12b() optsize {
>  entry:
> -  call void @good(i32 1, i32 2, i32 3, i32 4)
> -  call void @inreg(i32 5, i32 6, i32 7, i32 8)
> +  %s = alloca %struct.s, align 4
> +  call void @good(i32 1, i32 2, i32 3, i32 4)
> +  call void @struct(%struct.s* %s, i32 6, i32 7, i32 8)
>    call void @good(i32 9, i32 10, i32 11, i32 12)
>    ret void
>  }
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: extracted.ll
Type: application/octet-stream
Size: 15862 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150715/6e327648/attachment.obj>

