[llvm] r240257 - [X86] Allow more call sequences to use push instructions for argument passing

Reid Kleckner <rnk at google.com>
Wed Jul 15 18:31:13 PDT 2015


I ended up reverting the flag flip in r242373, as disabling
X86CallFrameOptimization altogether impacts other test cases (inalloca).

On Wed, Jul 15, 2015 at 6:11 PM, Reid Kleckner <rnk at google.com> wrote:

> The size win from this is pretty exciting, but it's miscompiling a
> function in Chromium:
>
> https://code.google.com/p/chromium/codesearch#chromium/src/third_party/libjingle/source/talk/session/media/mediasession.cc&l=606
>
> On the bright side, it's only one!
>
> We end up with a fastcc function (taking parameters in ecx / edx) calling
> a thiscall function in the entry block, and somehow the stack adjustment
> ends up in the wrong place:
>         pushl   %ebp
>         movl    %esp, %ebp
>         pushl   %ebx
>         pushl   %edi
>         pushl   %esi  <---- end pushing non-volatile regs
>         pushl   %edx <----- set up second argument to GetContentByName, ecx is already arranged
>         subl    $48, %esp <---- BUG: stack allocation here!?
>         calll   "?GetContentByName@SessionDescription@cricket@@QAEPAUContentInfo@2@ABV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z"
>
> I've attached the extracted function; you can see the bug by running it
> through llc and inspecting the output.
>
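> For reference, a rough sketch of the shape of the problem (hypothetical
> names, not the extracted function itself): a fastcall-style caller that
> forwards its register arguments straight to a thiscall callee while also
> needing stack locals of its own, e.g. run through
> llc -mtriple=i686-windows:
>
>     %class.C = type { i32 }
>
>     declare x86_thiscallcc void @callee(%class.C* %this, i32 %arg)
>
>     define x86_fastcallcc void @caller(%class.C* inreg %this, i32 inreg %arg) optsize {
>     entry:
>       ; stand-in for the 48 bytes of locals behind the subl above
>       %locals = alloca [48 x i8], align 4
>       call x86_thiscallcc void @callee(%class.C* %this, i32 %arg)
>       ret void
>     }
>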
> I'm going to flip the sense of the call frame optimization command line
> flag so that it's off by default for now. Feel free to flip it back when
> you sort this out.
>
> On Mon, Jun 22, 2015 at 1:31 AM, Michael Kuperstein <michael.m.kuperstein at intel.com> wrote:
>
>> Author: mkuper
>> Date: Mon Jun 22 03:31:22 2015
>> New Revision: 240257
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=240257&view=rev
>> Log:
>> [X86] Allow more call sequences to use push instructions for argument passing
>>
>> This allows more call sequences to use pushes instead of movs when
>> optimizing for size.
>> In particular, calling conventions that pass some parameters in registers
>> (e.g. thiscall) are now supported.
>>
>> Differential Revision: http://reviews.llvm.org/D10500
>>
>> Modified:
>>     llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
>>     llvm/trunk/test/CodeGen/X86/movtopush.ll
>>
>> Modified: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp?rev=240257&r1=240256&r2=240257&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp Mon Jun 22
>> 03:31:22 2015
>> @@ -78,7 +78,7 @@ private:
>>    typedef DenseMap<MachineInstr *, CallContext> ContextMap;
>>
>>    bool isLegal(MachineFunction &MF);
>> -
>> +
>>    bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap);
>>
>>    void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
>> @@ -90,6 +90,13 @@ private:
>>    MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
>>                                     unsigned Reg);
>>
>> +  enum InstClassification { Convert, Skip, Exit };
>> +
>> +  InstClassification classifyInstruction(MachineBasicBlock &MBB,
>> +                                         MachineBasicBlock::iterator MI,
>> +                                         const X86RegisterInfo &RegInfo,
>> +                                         DenseSet<unsigned int> &UsedRegs);
>> +
>>    const char *getPassName() const override { return "X86 Optimize Call Frame"; }
>>
>>    const TargetInstrInfo *TII;
>> @@ -105,7 +112,7 @@ FunctionPass *llvm::createX86CallFrameOp
>>    return new X86CallFrameOptimization();
>>  }
>>
>> -// This checks whether the transformation is legal.
>> +// This checks whether the transformation is legal.
>>  // Also returns false in cases where it's potentially legal, but
>>  // we don't even want to try.
>>  bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
>> @@ -170,9 +177,8 @@ bool X86CallFrameOptimization::isProfita
>>    if (!OptForSize)
>>      return false;
>>
>> -
>>    unsigned StackAlign = TFL->getStackAlignment();
>> -
>> +
>>    int64_t Advantage = 0;
>>    for (auto CC : CallSeqMap) {
>>      // Call sites where no parameters are passed on the stack
>> @@ -205,7 +211,6 @@ bool X86CallFrameOptimization::isProfita
>>    return (Advantage >= 0);
>>  }
>>
>> -
>>  bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
>>    TII = MF.getSubtarget().getInstrInfo();
>>    TFL = MF.getSubtarget().getFrameLowering();
>> @@ -237,6 +242,64 @@ bool X86CallFrameOptimization::runOnMach
>>    return Changed;
>>  }
>>
>> +X86CallFrameOptimization::InstClassification
>> +X86CallFrameOptimization::classifyInstruction(
>> +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
>> +    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
>> +  if (MI == MBB.end())
>> +    return Exit;
>> +
>> +  // The instructions we actually care about are movs onto the stack
>> +  int Opcode = MI->getOpcode();
>> +  if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
>> +    return Convert;
>> +
>> +  // Not all calling conventions have only stack MOVs between the stack
>> +  // adjust and the call.
>> +
>> +  // We want to tolerate other instructions, to cover more cases.
>> +  // In particular:
>> +  // a) PCrel calls, where we expect an additional COPY of the basereg.
>> +  // b) Passing frame-index addresses.
>> +  // c) Calling conventions that have inreg parameters. These generate
>> +  //    both copies and movs into registers.
>> +  // To avoid creating lots of special cases, allow any instruction
>> +  // that does not write into memory, does not def or use the stack
>> +  // pointer, and does not def any register that was used by a preceding
>> +  // push.
>> +  // (Reading from memory is allowed, even if referenced through a
>> +  // frame index, since these will get adjusted properly in PEI)
>> +
>> +  // The reason for the last condition is that the pushes can't replace
>> +  // the movs in place, because the order must be reversed.
>> +  // So if we have a MOV32mr that uses EDX, then an instruction that defs
>> +  // EDX, and then the call, after the transformation the push will use
>> +  // the modified version of EDX, and not the original one.
>> +  // Since we are still in SSA form at this point, we only need to
>> +  // make sure we don't clobber any *physical* registers that were
>> +  // used by an earlier mov that will become a push.
>> +
>> +  if (MI->isCall() || MI->mayStore())
>> +    return Exit;
>> +
>> +  for (const MachineOperand &MO : MI->operands()) {
>> +    if (!MO.isReg())
>> +      continue;
>> +    unsigned int Reg = MO.getReg();
>> +    if (!RegInfo.isPhysicalRegister(Reg))
>> +      continue;
>> +    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
>> +      return Exit;
>> +    if (MO.isDef()) {
>> +      for (unsigned int U : UsedRegs)
>> +        if (RegInfo.regsOverlap(Reg, U))
>> +          return Exit;
>> +    }
>> +  }
>> +
>> +  return Skip;
>> +}
>> +
>>  void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
>>                                                 MachineBasicBlock &MBB,
>>                                                 MachineBasicBlock::iterator I,
>> @@ -254,8 +317,8 @@ void X86CallFrameOptimization::collectCa
>>
>>    // How much do we adjust the stack? This puts an upper bound on
>>    // the number of parameters actually passed on it.
>> -  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
>> -
>> +  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
>> +
>>    // A zero adjustment means no stack parameters
>>    if (!MaxAdjust) {
>>      Context.NoStackParams = true;
>> @@ -284,11 +347,17 @@ void X86CallFrameOptimization::collectCa
>>    if (MaxAdjust > 4)
>>      Context.MovVector.resize(MaxAdjust, nullptr);
>>
>> -  do {
>> -    int Opcode = I->getOpcode();
>> -    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
>> -      break;
>> +  InstClassification Classification;
>> +  DenseSet<unsigned int> UsedRegs;
>>
>> +  while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
>> +         Exit) {
>> +    if (Classification == Skip) {
>> +      ++I;
>> +      continue;
>> +    }
>> +
>> +    // We know the instruction is a MOV32mi/MOV32mr.
>>      // We only want movs of the form:
>>      // movl imm/r32, k(%esp)
>>      // If we run into something else, bail.
>> @@ -323,24 +392,20 @@ void X86CallFrameOptimization::collectCa
>>        return;
>>      Context.MovVector[StackDisp] = I;
>>
>> -    ++I;
>> -  } while (I != MBB.end());
>> -
>> -  // We now expect the end of the sequence - a call and a stack adjust.
>> -  if (I == MBB.end())
>> -    return;
>> +    for (const MachineOperand &MO : I->uses()) {
>> +      if (!MO.isReg())
>> +        continue;
>> +      unsigned int Reg = MO.getReg();
>> +      if (RegInfo.isPhysicalRegister(Reg))
>> +        UsedRegs.insert(Reg);
>> +    }
>>
>> -  // For PCrel calls, we expect an additional COPY of the basereg.
>> -  // If we find one, skip it.
>> -  if (I->isCopy()) {
>> -    if (I->getOperand(1).getReg() ==
>> -        MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
>> -      ++I;
>> -    else
>> -      return;
>> +    ++I;
>>    }
>>
>> -  if (!I->isCall())
>> +  // We now expect the end of the sequence. If we stopped early,
>> +  // or reached the end of the block without finding a call, bail.
>> +  if (I == MBB.end() || !I->isCall())
>>      return;
>>
>>    Context.Call = I;
>>
>> Modified: llvm/trunk/test/CodeGen/X86/movtopush.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movtopush.ll?rev=240257&r1=240256&r2=240257&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/movtopush.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/movtopush.ll Mon Jun 22 03:31:22 2015
>> @@ -2,11 +2,15 @@
>>  ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
>>  ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
>>
>> +%class.Class = type { i32 }
>> +%struct.s = type { i64 }
>> +
>>  declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
>>  declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
>> +declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 %c, i32 %d)
>>  declare void @oneparam(i32 %a)
>>  declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
>> -
>> +declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
>>
>>  ; Here, we should have a reserved frame, so we don't expect pushes
>>  ; NORMAL-LABEL: test1:
>> @@ -108,13 +112,12 @@ entry:
>>    ret void
>>  }
>>
>> -; We don't support weird calling conventions
>> +; We support weird calling conventions
>>  ; NORMAL-LABEL: test4:
>> -; NORMAL: subl    $12, %esp
>> -; NORMAL-NEXT: movl    $4, 8(%esp)
>> -; NORMAL-NEXT: movl    $3, 4(%esp)
>> -; NORMAL-NEXT: movl    $1, (%esp)
>> -; NORMAL-NEXT: movl    $2, %eax
>> +; NORMAL: movl    $2, %eax
>> +; NORMAL-NEXT: pushl   $4
>> +; NORMAL-NEXT: pushl   $3
>> +; NORMAL-NEXT: pushl   $1
>>  ; NORMAL-NEXT: call
>>  ; NORMAL-NEXT: addl $12, %esp
>>  define void @test4() optsize {
>> @@ -123,6 +126,20 @@ entry:
>>    ret void
>>  }
>>
>> +; NORMAL-LABEL: test4b:
>> +; NORMAL: movl 4(%esp), %ecx
>> +; NORMAL-NEXT: pushl   $4
>> +; NORMAL-NEXT: pushl   $3
>> +; NORMAL-NEXT: pushl   $2
>> +; NORMAL-NEXT: pushl   $1
>> +; NORMAL-NEXT: call
>> +; NORMAL-NEXT: ret
>> +define void @test4b(%class.Class* %f) optsize {
>> +entry:
>> +  call x86_thiscallcc void @thiscall(%class.Class* %f, i32 1, i32 2, i32 3, i32 4)
>> +  ret void
>> +}
>> +
>>  ; When there is no reserved call frame, check that additional alignment
>>  ; is added when the pushes don't add up to the required alignment.
>>  ; ALIGNED-LABEL: test5:
>> @@ -229,20 +246,27 @@ entry:
>>  ; NORMAL-NEXT: pushl $1
>>  ; NORMAL-NEXT: call
>>  ; NORMAL-NEXT: addl $16, %esp
>> -; NORMAL-NEXT: subl $16, %esp
>> -; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
>> -; NORMAL-NEXT: movl    [[EAX]], 12(%esp)
>> -; NORMAL-NEXT: movl    $7, 8(%esp)
>> -; NORMAL-NEXT: movl    $6, 4(%esp)
>> -; NORMAL-NEXT: movl    $5, (%esp)
>> +; NORMAL-NEXT: subl $20, %esp
>> +; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
>> +; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
>> +; NORMAL-NEXT: movl    [[E2]], 4(%esp)
>> +; NORMAL-NEXT: movl    [[E1]], (%esp)
>> +; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]]
>> +; NORMAL-NEXT: movl    [[E3]], 16(%esp)
>> +; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]]
>> +; NORMAL-NEXT: movl    [[E4]], 12(%esp)
>> +; NORMAL-NEXT: movl    $6, 8(%esp)
>>  ; NORMAL-NEXT: call
>> -; NORMAL-NEXT: addl $16, %esp
>> +; NORMAL-NEXT: addl $20, %esp
>>  define void @test9() optsize {
>>  entry:
>>    %p = alloca i32, align 4
>> +  %q = alloca i32, align 4
>> +  %s = alloca %struct.s, align 4
>>    call void @good(i32 1, i32 2, i32 3, i32 4)
>> -  %0 = ptrtoint i32* %p to i32
>> -  call void @good(i32 5, i32 6, i32 7, i32 %0)
>> +  %pv = ptrtoint i32* %p to i32
>> +  %qv = ptrtoint i32* %q to i32
>> +  call void @struct(%struct.s* byval %s, i32 6, i32 %qv, i32 %pv)
>>    ret void
>>  }
>>
>> @@ -291,28 +315,17 @@ define void @test11() optsize {
>>  ; Converting one mov into a push isn't worth it when
>>  ; doing so forces too much overhead for other calls.
>>  ; NORMAL-LABEL: test12:
>> -; NORMAL: subl    $16, %esp
>> -; NORMAL-NEXT: movl    $4, 8(%esp)
>> -; NORMAL-NEXT: movl    $3, 4(%esp)
>> -; NORMAL-NEXT: movl    $1, (%esp)
>> -; NORMAL-NEXT: movl    $2, %eax
>> -; NORMAL-NEXT: calll _inreg
>> -; NORMAL-NEXT: movl    $8, 12(%esp)
>> +; NORMAL: movl    $8, 12(%esp)
>>  ; NORMAL-NEXT: movl    $7, 8(%esp)
>>  ; NORMAL-NEXT: movl    $6, 4(%esp)
>>  ; NORMAL-NEXT: movl    $5, (%esp)
>>  ; NORMAL-NEXT: calll _good
>> -; NORMAL-NEXT: movl    $12, 8(%esp)
>> -; NORMAL-NEXT: movl    $11, 4(%esp)
>> -; NORMAL-NEXT: movl    $9, (%esp)
>> -; NORMAL-NEXT: movl    $10, %eax
>> -; NORMAL-NEXT: calll _inreg
>> -; NORMAL-NEXT: addl $16, %esp
>>  define void @test12() optsize {
>>  entry:
>> -  call void @inreg(i32 1, i32 2, i32 3, i32 4)
>> +  %s = alloca %struct.s, align 4
>> +  call void @struct(%struct.s* %s, i32 2, i32 3, i32 4)
>>    call void @good(i32 5, i32 6, i32 7, i32 8)
>> -  call void @inreg(i32 9, i32 10, i32 11, i32 12)
>> +  call void @struct(%struct.s* %s, i32 10, i32 11, i32 12)
>>    ret void
>>  }
>>
>> @@ -324,13 +337,12 @@ entry:
>>  ; NORMAL-NEXT: pushl    $1
>>  ; NORMAL-NEXT: calll _good
>>  ; NORMAL-NEXT: addl    $16, %esp
>> -; NORMAL-NEXT: subl    $12, %esp
>> -; NORMAL-NEXT: movl    $8, 8(%esp)
>> -; NORMAL-NEXT: movl    $7, 4(%esp)
>> -; NORMAL-NEXT: movl    $5, (%esp)
>> -; NORMAL-NEXT: movl    $6, %eax
>> -; NORMAL-NEXT: calll _inreg
>> -; NORMAL-NEXT: addl    $12, %esp
>> +; NORMAL-NEXT: subl    $20, %esp
>> +; NORMAL: movl    $8, 16(%esp)
>> +; NORMAL-NEXT: movl    $7, 12(%esp)
>> +; NORMAL-NEXT: movl    $6, 8(%esp)
>> +; NORMAL-NEXT: calll _struct
>> +; NORMAL-NEXT: addl    $20, %esp
>>  ; NORMAL-NEXT: pushl    $12
>>  ; NORMAL-NEXT: pushl    $11
>>  ; NORMAL-NEXT: pushl    $10
>> @@ -339,8 +351,9 @@ entry:
>>  ; NORMAL-NEXT: addl $16, %esp
>>  define void @test12b() optsize {
>>  entry:
>> -  call void @good(i32 1, i32 2, i32 3, i32 4)
>> -  call void @inreg(i32 5, i32 6, i32 7, i32 8)
>> +  %s = alloca %struct.s, align 4
>> +  call void @good(i32 1, i32 2, i32 3, i32 4)
>> +  call void @struct(%struct.s* %s, i32 6, i32 7, i32 8)
>>    call void @good(i32 9, i32 10, i32 11, i32 12)
>>    ret void
>>  }
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>
>
>