[PATCH] [X86] Convert esp-relative movs of function arguments to pushes, step 2
Roman Divacky
rdivacky at vlakno.cz
Sun Dec 28 09:39:19 PST 2014
This patch shrinks the FreeBSD boot loader by 88 bytes (roughly 1%). Note that there is
a hard limit on the size of the loader, so this is very important for us :)
You can find the code (in a single file) here:
http://www.andric.com/freebsd/clang/boot2-minimal.c
We compile it without GVN (our local hack) and with these CFLAGS:
-Oz -fomit-frame-pointer -mrtd -mregparm=3 -march=i386 -ffreestanding -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 -msoft-float -m32 -std=gnu99 -mstack-alignment=8 -mllvm -inline-threshold=3 -mllvm -simplifycfg-dup-ret
Feel free to improve it even more :)
Thanks! Roman
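
For anyone who wants a quick picture of the transformation discussed below, here is a
minimal sketch of what it does to a 32-bit call sequence (adapted from the test cases
in movtopush.ll in the patch; the immediates and the call target are illustrative, not
taken from the boot loader itself):

  Before (esp-relative movs):
        subl    $16, %esp
        movl    $4, 12(%esp)
        movl    $3, 8(%esp)
        movl    $2, 4(%esp)
        movl    $1, (%esp)
        calll   _good

  After (push sequence):
        pushl   $4
        pushl   $3
        pushl   $2
        pushl   $1
        calll   _good

Each pushl of a small immediate is only two bytes, versus seven or eight for the
corresponding esp-relative movl, which is where the size win comes from.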
On Sun, Dec 28, 2014 at 02:11:42PM +0000, Michael Kuperstein wrote:
> Hi nadav, rnk, delena,
>
> This is a first stab at the next step of the mov-to-push transformation.
>
> It moves the transformation earlier in the pass order so that it can do load-folding, and prepares the required infrastructure.
> It is still enabled only in cases where it should be a clear win - when we don't expect to have a reserved call frame, or when optimizing for size.
> The next step will be a heuristic that makes a smarter decision on when this should be enabled.
>
> As a side note - I've done some internal testing of the effects on code size, but I'd like to do some testing on things other people care about as well. So, if you have an x86-32 code base whose size you care about and that is publicly available, let me know.
>
> http://reviews.llvm.org/D6789
>
> Files:
> lib/Target/X86/CMakeLists.txt
> lib/Target/X86/X86.h
> lib/Target/X86/X86ConvertMovsToPushes.cpp
> lib/Target/X86/X86FastISel.cpp
> lib/Target/X86/X86FrameLowering.cpp
> lib/Target/X86/X86FrameLowering.h
> lib/Target/X86/X86InstrCompiler.td
> lib/Target/X86/X86MachineFunctionInfo.h
> lib/Target/X86/X86TargetMachine.cpp
> test/CodeGen/X86/inalloca-invoke.ll
> test/CodeGen/X86/movtopush.ll
>
> Index: test/CodeGen/X86/movtopush.ll
> ===================================================================
> --- test/CodeGen/X86/movtopush.ll
> +++ test/CodeGen/X86/movtopush.ll
> @@ -1,10 +1,11 @@
> ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
> +; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
> ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
> declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
> declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
>
> ; Here, we should have a reserved frame, so we don't expect pushes
> -; NORMAL-LABEL: test1
> +; NORMAL-LABEL: test1:
> ; NORMAL: subl $16, %esp
> ; NORMAL-NEXT: movl $4, 12(%esp)
> ; NORMAL-NEXT: movl $3, 8(%esp)
> @@ -17,8 +18,43 @@
> ret void
> }
>
> -; Here, we expect a sequence of 4 immediate pushes
> -; NORMAL-LABEL: test2
> +; We're optimizing for code size, so we should get pushes for x86.
> +; Make sure we don't touch x86-64
> +; NORMAL-LABEL: test1b:
> +; NORMAL-NOT: subl {{.*}} %esp
> +; NORMAL: pushl $4
> +; NORMAL-NEXT: pushl $3
> +; NORMAL-NEXT: pushl $2
> +; NORMAL-NEXT: pushl $1
> +; NORMAL-NEXT: call
> +; X64-LABEL: test1b:
> +; X64: movl $1, %ecx
> +; X64-NEXT: movl $2, %edx
> +; X64-NEXT: movl $3, %r8d
> +; X64-NEXT: movl $4, %r9d
> +; X64-NEXT: callq good
> +define void @test1b() optsize {
> +entry:
> + call void @good(i32 1, i32 2, i32 3, i32 4)
> + ret void
> +}
> +
> +; Same as above, but for minsize
> +; NORMAL-LABEL: test1c:
> +; NORMAL-NOT: subl {{.*}} %esp
> +; NORMAL: pushl $4
> +; NORMAL-NEXT: pushl $3
> +; NORMAL-NEXT: pushl $2
> +; NORMAL-NEXT: pushl $1
> +; NORMAL-NEXT: call
> +define void @test1c() minsize {
> +entry:
> + call void @good(i32 1, i32 2, i32 3, i32 4)
> + ret void
> +}
> +
> +; If we don't have a reserved frame, we should have pushes
> +; NORMAL-LABEL: test2:
> ; NORMAL-NOT: subl {{.*}} %esp
> ; NORMAL: pushl $4
> ; NORMAL-NEXT: pushl $3
> @@ -34,53 +70,50 @@
>
> ; Again, we expect a sequence of 4 immediate pushes
> ; Checks that we generate the right pushes for >8bit immediates
> -; NORMAL-LABEL: test2b
> +; NORMAL-LABEL: test2b:
> ; NORMAL-NOT: subl {{.*}} %esp
> ; NORMAL: pushl $4096
> ; NORMAL-NEXT: pushl $3072
> ; NORMAL-NEXT: pushl $2048
> ; NORMAL-NEXT: pushl $1024
> ; NORMAL-NEXT: call
> -define void @test2b(i32 %k) {
> +define void @test2b() optsize {
> entry:
> - %a = alloca i32, i32 %k
> call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
> ret void
> }
>
> ; The first push should push a register
> -; NORMAL-LABEL: test3
> +; NORMAL-LABEL: test3:
> ; NORMAL-NOT: subl {{.*}} %esp
> ; NORMAL: pushl $4
> ; NORMAL-NEXT: pushl $3
> ; NORMAL-NEXT: pushl $2
> ; NORMAL-NEXT: pushl %e{{..}}
> ; NORMAL-NEXT: call
> -define void @test3(i32 %k) {
> +define void @test3(i32 %k) optsize {
> entry:
> - %a = alloca i32, i32 %k
> call void @good(i32 %k, i32 2, i32 3, i32 4)
> ret void
> }
>
> ; We don't support weird calling conventions
> -; NORMAL-LABEL: test4
> +; NORMAL-LABEL: test4:
> ; NORMAL: subl $12, %esp
> ; NORMAL-NEXT: movl $4, 8(%esp)
> ; NORMAL-NEXT: movl $3, 4(%esp)
> ; NORMAL-NEXT: movl $1, (%esp)
> ; NORMAL-NEXT: movl $2, %eax
> ; NORMAL-NEXT: call
> -define void @test4(i32 %k) {
> +define void @test4() optsize {
> entry:
> - %a = alloca i32, i32 %k
> call void @inreg(i32 1, i32 2, i32 3, i32 4)
> ret void
> }
>
> -; Check that additional alignment is added when the pushes
> -; don't add up to the required alignment.
> -; ALIGNED-LABEL: test5
> +; When there is no reserved call frame, check that additional alignment
> +; is added when the pushes don't add up to the required alignment.
> +; ALIGNED-LABEL: test5:
> ; ALIGNED: subl $16, %esp
> ; ALIGNED-NEXT: pushl $4
> ; ALIGNED-NEXT: pushl $3
> @@ -97,7 +130,7 @@
> ; Check that pushing the addresses of globals (Or generally, things that
> ; aren't exactly immediates) isn't broken.
> ; Fixes PR21878.
> -; NORMAL-LABEL: test6
> +; NORMAL-LABEL: test6:
> ; NORMAL: pushl $_ext
> ; NORMAL-NEXT: call
> declare void @f(i8*)
> @@ -110,3 +143,36 @@
> alloca i32
> ret void
> }
> +
> +; Check that we fold simple cases into the push
> +; NORMAL-LABEL: test7:
> +; NORMAL-NOT: subl {{.*}} %esp
> +; NORMAL: movl 4(%esp), [[EAX:%e..]]
> +; NORMAL-NEXT: pushl $4
> +; NORMAL-NEXT: pushl ([[EAX]])
> +; NORMAL-NEXT: pushl $2
> +; NORMAL-NEXT: pushl $1
> +; NORMAL-NEXT: call
> +define void @test7(i32* %ptr) optsize {
> +entry:
> + %val = load i32* %ptr
> + call void @good(i32 1, i32 2, i32 %val, i32 4)
> + ret void
> +}
> +
> +; But we don't want to fold stack-relative loads into the push,
> +; because the offset will be wrong
> +; NORMAL-LABEL: test8:
> +; NORMAL-NOT: subl {{.*}} %esp
> +; NORMAL: movl 4(%esp), [[EAX:%e..]]
> +; NORMAL-NEXT: pushl $4
> +; NORMAL-NEXT: pushl [[EAX]]
> +; NORMAL-NEXT: pushl $2
> +; NORMAL-NEXT: pushl $1
> +; NORMAL-NEXT: call
> +define void @test8(i32* %ptr) optsize {
> +entry:
> + %val = ptrtoint i32* %ptr to i32
> + call void @good(i32 1, i32 2, i32 %val, i32 4)
> + ret void
> +}
> \ No newline at end of file
> Index: test/CodeGen/X86/inalloca-invoke.ll
> ===================================================================
> --- test/CodeGen/X86/inalloca-invoke.ll
> +++ test/CodeGen/X86/inalloca-invoke.ll
> @@ -31,7 +31,7 @@
> to label %invoke.cont unwind label %lpad
>
> ; Uses end as sret param.
> -; CHECK: movl %[[end]], (%esp)
> +; CHECK: pushl %[[end]]
> ; CHECK: calll _plus
>
> invoke.cont:
> Index: lib/Target/X86/X86MachineFunctionInfo.h
> ===================================================================
> --- lib/Target/X86/X86MachineFunctionInfo.h
> +++ lib/Target/X86/X86MachineFunctionInfo.h
> @@ -77,6 +77,9 @@
> unsigned ArgumentStackSize;
> /// NumLocalDynamics - Number of local-dynamic TLS accesses.
> unsigned NumLocalDynamics;
> + /// HasPushSequences - Keeps track of whether this function uses sequences
> + /// of pushes to pass function parameters.
> + bool HasPushSequences;
>
> private:
> /// ForwardedMustTailRegParms - A list of virtual and physical registers
> @@ -97,7 +100,8 @@
> VarArgsGPOffset(0),
> VarArgsFPOffset(0),
> ArgumentStackSize(0),
> - NumLocalDynamics(0) {}
> + NumLocalDynamics(0),
> + HasPushSequences(false) {}
>
> explicit X86MachineFunctionInfo(MachineFunction &MF)
> : ForceFramePointer(false),
> @@ -113,11 +117,15 @@
> VarArgsGPOffset(0),
> VarArgsFPOffset(0),
> ArgumentStackSize(0),
> - NumLocalDynamics(0) {}
> + NumLocalDynamics(0),
> + HasPushSequences(false) {}
>
> bool getForceFramePointer() const { return ForceFramePointer;}
> void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
>
> + bool getHasPushSequences() const { return HasPushSequences; }
> + void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
> +
> bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
> void setRestoreBasePointer(const MachineFunction *MF);
> int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
> Index: lib/Target/X86/X86FrameLowering.h
> ===================================================================
> --- lib/Target/X86/X86FrameLowering.h
> +++ lib/Target/X86/X86FrameLowering.h
> @@ -64,6 +64,7 @@
>
> bool hasFP(const MachineFunction &MF) const override;
> bool hasReservedCallFrame(const MachineFunction &MF) const override;
> + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
>
> int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
> int getFrameIndexReference(const MachineFunction &MF, int FI,
> Index: lib/Target/X86/X86.h
> ===================================================================
> --- lib/Target/X86/X86.h
> +++ lib/Target/X86/X86.h
> @@ -67,6 +67,10 @@
> /// to eliminate execution delays in some Atom processors.
> FunctionPass *createX86FixupLEAs();
>
> +/// createX86ConvertMovsToPushes - Return a pass that converts movs
> +/// that store function parameters onto the stack into pushes.
> +FunctionPass *createX86ConvertMovsToPushes();
> +
> } // End llvm namespace
>
> #endif
> Index: lib/Target/X86/X86FrameLowering.cpp
> ===================================================================
> --- lib/Target/X86/X86FrameLowering.cpp
> +++ lib/Target/X86/X86FrameLowering.cpp
> @@ -38,7 +38,17 @@
> extern cl::opt<bool> ForceStackAlign;
>
> bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
> - return !MF.getFrameInfo()->hasVarSizedObjects();
> + return !MF.getFrameInfo()->hasVarSizedObjects() &&
> + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
> +}
> +
> +// We can simplify even if we don't have a reserved call frame, in case
> +// the only reason we don't have one is that we did the mov -> push
> +// transformation.
> +bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF)
> + const {
> + return hasReservedCallFrame(MF) || hasFP(MF) ||
> + (!hasReservedCallFrame(MF) && !MF.getFrameInfo()->hasVarSizedObjects());
> }
>
> /// hasFP - Return true if the specified function should have a dedicated frame
> @@ -93,16 +103,6 @@
> return X86::AND32ri;
> }
>
> -static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) {
> - // We don't support LP64 for now.
> - assert(!IsLP64);
> -
> - if (MO.isImm() && isInt<8>(MO.getImm()))
> - return X86::PUSH32i8;
> -
> - return X86::PUSHi32;;
> -}
> -
> static unsigned getLEArOpcode(unsigned IsLP64) {
> return IsLP64 ? X86::LEA64r : X86::LEA32r;
> }
> @@ -1810,100 +1810,6 @@
> #endif
> }
>
> -bool X86FrameLowering::
> -convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
> - MachineBasicBlock::iterator I, uint64_t Amount) const {
> - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
> - const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
> - MF.getSubtarget().getRegisterInfo());
> - unsigned StackPtr = RegInfo.getStackRegister();
> -
> - // Scan the call setup sequence for the pattern we're looking for.
> - // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
> - // instructions, that push a sequence of 32-bit values onto the stack, with
> - // no gaps.
> - std::map<int64_t, MachineBasicBlock::iterator> MovMap;
> - do {
> - int Opcode = I->getOpcode();
> - if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
> - break;
> -
> - // We only want movs of the form:
> - // movl imm/r32, k(%ecx)
> - // If we run into something else, bail
> - // Note that AddrBaseReg may, counterintuitively, not be a register...
> - if (!I->getOperand(X86::AddrBaseReg).isReg() ||
> - (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
> - !I->getOperand(X86::AddrScaleAmt).isImm() ||
> - (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
> - (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
> - (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
> - !I->getOperand(X86::AddrDisp).isImm())
> - return false;
> -
> - int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
> -
> - // We don't want to consider the unaligned case.
> - if (StackDisp % 4)
> - return false;
> -
> - // If the same stack slot is being filled twice, something's fishy.
> - if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
> - return false;
> -
> - ++I;
> - } while (I != MBB.end());
> -
> - // We now expect the end of the sequence - a call and a stack adjust.
> - if (I == MBB.end())
> - return false;
> - if (!I->isCall())
> - return false;
> - MachineBasicBlock::iterator Call = I;
> - if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
> - return false;
> -
> - // Now, go through the map, and see that we don't have any gaps,
> - // but only a series of 32-bit MOVs.
> - // Since std::map provides ordered iteration, the original order
> - // of the MOVs doesn't matter.
> - int64_t ExpectedDist = 0;
> - for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;
> - ++MMI, ExpectedDist += 4)
> - if (MMI->first != ExpectedDist)
> - return false;
> -
> - // Ok, everything looks fine. Do the transformation.
> - DebugLoc DL = I->getDebugLoc();
> -
> - // It's possible the original stack adjustment amount was larger than
> - // that done by the pushes. If so, we still need a SUB.
> - Amount -= ExpectedDist;
> - if (Amount) {
> - MachineInstr* Sub = BuildMI(MBB, Call, DL,
> - TII.get(getSUBriOpcode(false, Amount)), StackPtr)
> - .addReg(StackPtr).addImm(Amount);
> - Sub->getOperand(3).setIsDead();
> - }
> -
> - // Now, iterate through the map in reverse order, and replace the movs
> - // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
> - for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
> - MachineBasicBlock::iterator MOV = MMI->second;
> - MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
> -
> - // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size
> - int PushOpcode = X86::PUSH32r;
> - if (MOV->getOpcode() == X86::MOV32mi)
> - PushOpcode = getPUSHiOpcode(false, PushOp);
> -
> - BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp);
> - MBB.erase(MOV);
> - }
> -
> - return true;
> -}
> -
> void X86FrameLowering::
> eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
> MachineBasicBlock::iterator I) const {
> @@ -1918,7 +1824,7 @@
> bool IsLP64 = STI.isTarget64BitLP64();
> DebugLoc DL = I->getDebugLoc();
> uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
> - uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
> + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
> I = MBB.erase(I);
>
> if (!reserveCallFrame) {
> @@ -1938,24 +1844,18 @@
> Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
>
> MachineInstr *New = nullptr;
> - if (Opcode == TII.getCallFrameSetupOpcode()) {
> - // Try to convert movs to the stack into pushes.
> - // We currently only look for a pattern that appears in 32-bit
> - // calling conventions.
> - if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
> - return;
>
> - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
> - StackPtr)
> - .addReg(StackPtr)
> - .addImm(Amount);
> - } else {
> - assert(Opcode == TII.getCallFrameDestroyOpcode());
> -
> - // Factor out the amount the callee already popped.
> - Amount -= CalleeAmt;
> + // Factor out the amount that gets handled inside the sequence
> + // (Pushes of argument for frame setup, callee pops for frame destroy)
> + Amount -= InternalAmt;
> +
> + if (Amount) {
> + if (Opcode == TII.getCallFrameSetupOpcode()) {
> + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
> + .addReg(StackPtr).addImm(Amount);
> + } else {
> + assert(Opcode == TII.getCallFrameDestroyOpcode());
>
> - if (Amount) {
> unsigned Opc = getADDriOpcode(IsLP64, Amount);
> New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
> .addReg(StackPtr).addImm(Amount);
> @@ -1973,13 +1873,13 @@
> return;
> }
>
> - if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
> + if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
> // If we are performing frame pointer elimination and if the callee pops
> // something off the stack pointer, add it back. We do this until we have
> // more advanced stack pointer tracking ability.
> - unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
> + unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
> MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
> - .addReg(StackPtr).addImm(CalleeAmt);
> + .addReg(StackPtr).addImm(InternalAmt);
>
> // The EFLAGS implicit def is dead.
> New->getOperand(3).setIsDead();
> Index: lib/Target/X86/X86TargetMachine.cpp
> ===================================================================
> --- lib/Target/X86/X86TargetMachine.cpp
> +++ lib/Target/X86/X86TargetMachine.cpp
> @@ -154,6 +154,7 @@
> void addIRPasses() override;
> bool addInstSelector() override;
> bool addILPOpts() override;
> + void addPreRegAlloc() override;
> void addPostRegAlloc() override;
> void addPreEmitPass() override;
> };
> @@ -187,6 +188,10 @@
> return true;
> }
>
> +void X86PassConfig::addPreRegAlloc() {
> + addPass(createX86ConvertMovsToPushes());
> +}
> +
> void X86PassConfig::addPostRegAlloc() {
> addPass(createX86FloatingPointStackifierPass());
> }
> Index: lib/Target/X86/X86InstrCompiler.td
> ===================================================================
> --- lib/Target/X86/X86InstrCompiler.td
> +++ lib/Target/X86/X86InstrCompiler.td
> @@ -43,32 +43,36 @@
> // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
> // sub / add which can clobber EFLAGS.
> let Defs = [ESP, EFLAGS], Uses = [ESP] in {
> -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
> +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
> "#ADJCALLSTACKDOWN",
> - [(X86callseq_start timm:$amt)]>,
> + []>,
> Requires<[NotLP64]>;
> def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
> "#ADJCALLSTACKUP",
> [(X86callseq_end timm:$amt1, timm:$amt2)]>,
> Requires<[NotLP64]>;
> }
> +def : Pat<(X86callseq_start timm:$amt1),
> + (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
> +
>
> // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
> // a stack adjustment and the codegen must know that they may modify the stack
> // pointer before prolog-epilog rewriting occurs.
> // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
> // sub / add which can clobber EFLAGS.
> let Defs = [RSP, EFLAGS], Uses = [RSP] in {
> -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
> +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
> "#ADJCALLSTACKDOWN",
> - [(X86callseq_start timm:$amt)]>,
> + []>,
> Requires<[IsLP64]>;
> def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
> "#ADJCALLSTACKUP",
> [(X86callseq_end timm:$amt1, timm:$amt2)]>,
> Requires<[IsLP64]>;
> }
> -
> +def : Pat<(X86callseq_start timm:$amt1),
> + (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
>
>
> // x86-64 va_start lowering magic.
> Index: lib/Target/X86/X86FastISel.cpp
> ===================================================================
> --- lib/Target/X86/X86FastISel.cpp
> +++ lib/Target/X86/X86FastISel.cpp
> @@ -2736,7 +2736,7 @@
> // Issue CALLSEQ_START
> unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
> BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
> - .addImm(NumBytes);
> + .addImm(NumBytes).addImm(0);
>
> // Walk the register/memloc assignments, inserting copies/loads.
> const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
> Index: lib/Target/X86/CMakeLists.txt
> ===================================================================
> --- lib/Target/X86/CMakeLists.txt
> +++ lib/Target/X86/CMakeLists.txt
> @@ -14,6 +14,7 @@
>
> set(sources
> X86AsmPrinter.cpp
> + X86ConvertMovsToPushes.cpp
> X86FastISel.cpp
> X86FloatingPoint.cpp
> X86FrameLowering.cpp
> Index: lib/Target/X86/X86ConvertMovsToPushes.cpp
> ===================================================================
> --- lib/Target/X86/X86ConvertMovsToPushes.cpp
> +++ lib/Target/X86/X86ConvertMovsToPushes.cpp
> @@ -0,0 +1,294 @@
> +//===------ X86ConvertMovsToPushes.cpp - Convert movs to pushes -----------===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file defines a pass that converts movs of function parameters onto the
> +// stack into pushes. This is beneficial for two main reasons:
> +// 1) The push instruction encoding is much smaller than an esp-relative mov.
> +// 2) It is possible to push memory arguments directly. So, if the
> +//    transformation is performed pre-reg-alloc, it can help relieve
> +//    register pressure.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#include <algorithm>
> +
> +#include "X86.h"
> +#include "X86InstrInfo.h"
> +#include "X86Subtarget.h"
> +#include "X86MachineFunctionInfo.h"
> +#include "llvm/ADT/Statistic.h"
> +#include "llvm/CodeGen/MachineFunctionPass.h"
> +#include "llvm/CodeGen/MachineInstrBuilder.h"
> +#include "llvm/CodeGen/MachineRegisterInfo.h"
> +#include "llvm/CodeGen/Passes.h"
> +#include "llvm/IR/Function.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/Target/TargetInstrInfo.h"
> +
> +using namespace llvm;
> +
> +#define DEBUG_TYPE "x86-mov-to-push"
> +
> +cl::opt<bool> NoMovToPush("no-mov-to-push",
> + cl::desc("Avoid function argument mov-to-push transformation"),
> + cl::init(false), cl::Hidden);
> +
> +namespace {
> +class X86ConvertMovsToPushes : public MachineFunctionPass {
> +public:
> + X86ConvertMovsToPushes() : MachineFunctionPass(ID) {}
> +
> + bool runOnMachineFunction(MachineFunction &MF) override;
> +
> +private:
> + bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator I);
> +
> + MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
> + unsigned Reg);
> +
> + const char *getPassName() const override {
> + return "X86 Convert Movs to Pushes";
> + }
> +
> + const TargetInstrInfo *TII;
> + const MachineRegisterInfo *MRI;
> + static char ID;
> +};
> +
> +char X86ConvertMovsToPushes::ID = 0;
> +}
> +
> +FunctionPass *llvm::createX86ConvertMovsToPushes() {
> + return new X86ConvertMovsToPushes();
> +}
> +
> +bool X86ConvertMovsToPushes::runOnMachineFunction(MachineFunction &MF) {
> + if (NoMovToPush.getValue())
> + return false;
> +
> + // We currently only support call sequences where *all* parameters
> + // are passed on the stack.
> + // No point in running this in 64-bit mode, since some arguments are
> + // passed in-register in all common calling conventions, so the pattern
> + // we're looking for will never match.
> + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
> + if (STI.is64Bit())
> + return false;
> +
> + // This transformation is always a win when optimizing for size,
> + // or when we are not going to have a reserved call frame.
> + // Under other circumstances, it may be either a win or a loss,
> + // and requires a heuristic.
> + // For now, enable it only for the clear win cases.
> +
> + // TODO: Add a heuristic that actually looks at the function,
> + // and enable this for more cases.
> +
> + AttributeSet FnAttrs = MF.getFunction()->getAttributes();
> + bool OptForSize =
> + FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
> + Attribute::OptimizeForSize) ||
> + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
> +
> + if (!MF.getFrameInfo()->hasVarSizedObjects() && !OptForSize)
> + return false;
> +
> + TII = MF.getSubtarget().getInstrInfo();
> + MRI = &MF.getRegInfo();
> + int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
> +
> + bool Changed = false;
> +
> + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
> + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
> + if (I->getOpcode() == FrameSetupOpcode)
> + Changed |= adjustCallSequence(MF, *BB, I);
> +
> + return Changed;
> +}
> +
> +bool X86ConvertMovsToPushes::adjustCallSequence(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator I) {
> +
> + // Check that this particular call sequence is amenable to the
> + // transformation.
> + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
> + MF.getSubtarget().getRegisterInfo());
> + unsigned StackPtr = RegInfo.getStackRegister();
> + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
> +
> + // We expect to enter this at the beginning of a call sequence
> + assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
> + MachineBasicBlock::iterator FrameSetup = I++;
> +
> + // We expect a copy instruction here.
> + // TODO: The copy instruction here is a lowering artifact.
> + // We should also support a copy-less version, where the stack
> + // pointer is used directly.
> + if (!I->isCopy() || !I->getOperand(0).isReg())
> + return false;
> + MachineBasicBlock::iterator SPCopy = I++;
> + StackPtr = SPCopy->getOperand(0).getReg();
> +
> + // Scan the call setup sequence for the pattern we're looking for.
> + // We only handle a simple case - a sequence of MOV32mi or MOV32mr
> + // instructions that store a sequence of 32-bit values onto the stack, with
> + // no gaps between them.
> + std::map<int64_t, MachineBasicBlock::iterator> MovMap;
> +
> + do {
> + int Opcode = I->getOpcode();
> + if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
> + break;
> +
> + // We only want movs of the form:
> + // movl imm/r32, k(%esp)
> + // If we run into something else, bail.
> + // Note that AddrBaseReg may, counter to its name, not be a register,
> + // but rather a frame index.
> + if (!I->getOperand(X86::AddrBaseReg).isReg() ||
> + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
> + !I->getOperand(X86::AddrScaleAmt).isImm() ||
> + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
> + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
> + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
> + !I->getOperand(X86::AddrDisp).isImm())
> + return false;
> +
> + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
> +
> + // We really don't want to consider the unaligned case.
> + if (StackDisp % 4)
> + return false;
> +
> + // If the same stack slot is being filled twice, something's fishy.
> + if (!MovMap.insert(std::pair<int64_t, MachineInstr *>(StackDisp, I)).second)
> + return false;
> +
> + ++I;
> + } while (I != MBB.end());
> +
> + // We now expect the end of the sequence - a call and a stack adjust.
> + if (I == MBB.end())
> + return false;
> + if (!I->isCall())
> + return false;
> + MachineBasicBlock::iterator Call = I;
> + if ((++I)->getOpcode() != FrameDestroyOpcode)
> + return false;
> +
> + // Now, go through the map, and see that we don't have any gaps,
> + // but only a series of 32-bit MOVs.
> + // Since std::map provides ordered iteration, the original order
> + // of the MOVs doesn't matter.
> + int64_t ExpectedDist = 0;
> + for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;
> + ++MMI, ExpectedDist += 4)
> + if (MMI->first != ExpectedDist)
> + return false;
> +
> + // Ok, we can in fact do the transformation for this call.
> + // Do not remove the FrameSetup instruction, but adjust the size.
> + // PEI will end up finalizing the handling of that.
> + FrameSetup->getOperand(1).setImm(ExpectedDist);
> +
> + DebugLoc DL = I->getDebugLoc();
> + // Now, iterate through the map in reverse order, and replace the movs
> + // with pushes. MOVmi/MOVmr have no defs, so no uses need to be replaced.
> + for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
> + MachineBasicBlock::iterator MOV = MMI->second;
> + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
> + if (MOV->getOpcode() == X86::MOV32mi) {
> + unsigned PushOpcode = X86::PUSHi32;
> + if (PushOp.isImm()) {
> + int64_t Val = PushOp.getImm();
> + if (isInt<8>(Val))
> + PushOpcode = X86::PUSH32i8;
> + }
> + BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
> + } else {
> + unsigned int Reg = PushOp.getReg();
> +
> + // If PUSHrmm is not slow on this target, try to fold the source of the
> + // push into the instruction.
> + const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
> + bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
> + MachineInstr *DefMov = nullptr;
> + if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
> + MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
> +
> + unsigned NumOps = DefMov->getDesc().getNumOperands();
> + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
> + Push->addOperand(DefMov->getOperand(i));
> +
> + DefMov->eraseFromParent();
> + } else {
> + BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
> + }
> + }
> +
> + MBB.erase(MOV);
> + }
> +
> + // The stack-pointer copy is no longer used in the call sequences.
> + // There should not be any other users, but we can't commit to that, so:
> + if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
> + SPCopy->eraseFromParent();
> +
> + // Once we've done this, we need to make sure PEI doesn't assume a reserved
> + // frame.
> + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
> + FuncInfo->setHasPushSequences(true);
> +
> + return true;
> +}
> +
> +MachineInstr *X86ConvertMovsToPushes::canFoldIntoRegPush(
> + MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
> + // Do an extremely restricted form of load folding.
> + // ISel will often create patterns like:
> + // movl 4(%edi), %eax
> + // movl 8(%edi), %ecx
> + // movl 12(%edi), %edx
> + // movl %edx, 8(%esp)
> + // movl %ecx, 4(%esp)
> + // movl %eax, (%esp)
> + // call
> + // Get rid of those with prejudice.
> + if (!TargetRegisterInfo::isVirtualRegister(Reg))
> + return nullptr;
> +
> + // Make sure this is the only use of Reg.
> + if (!MRI->hasOneNonDBGUse(Reg))
> + return nullptr;
> +
> + MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
> +
> + // Make sure the def is a MOV from memory.
> + // If the def is in another block, give up.
> + if (DefMI->getOpcode() != X86::MOV32rm ||
> + DefMI->getParent() != FrameSetup->getParent())
> + return nullptr;
> +
> + // Now, make sure nothing between the def and the ADJCALLSTACK may store to
> + // memory, so that it is safe to move the load down to the push.
> + for (auto I = DefMI; I != FrameSetup; ++I)
> + if (I->mayStore())
> + return nullptr;
> +
> + // Be careful with movs that load from a stack slot, since the offset may
> + // get resolved incorrectly once the pushes adjust the stack pointer.
> + if (!DefMI->getOperand(1).isReg())
> + return nullptr;
> +
> + return DefMI;
> +}