[llvm] r268227 - Enable the X86 call frame optimization for the 64-bit targets that allow it.
David L Kreitzer via llvm-commits
llvm-commits@lists.llvm.org
Mon May 2 06:45:25 PDT 2016
Author: dlkreitz
Date: Mon May 2 08:45:25 2016
New Revision: 268227
URL: http://llvm.org/viewvc/llvm-project?rev=268227&view=rev
Log:
Enable the X86 call frame optimization for the 64-bit targets that allow it.
Fixes PR27241.
Differential Revision: http://reviews.llvm.org/D19688
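For illustration only (this sketch is not part of the commit message; the
movtopush64.ll test added below checks the real output): on 64-bit Linux,
the stack-passed arguments of a call such as test1's ten_params go,
schematically, from

  movl  $7, (%rsp)
  movq  $8, 8(%rsp)
  movl  $9, 16(%rsp)
  movq  $10, 24(%rsp)
  callq ten_params

to

  pushq $10
  pushq $9
  pushq $8
  pushq $7
  callq ten_params
  addq  $32, %rsp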
Added:
llvm/trunk/test/CodeGen/X86/movtopush64.ll
Modified:
llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
Modified: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp?rev=268227&r1=268226&r2=268227&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp Mon May 2 08:45:25 2016
@@ -105,7 +105,7 @@ private:
const TargetInstrInfo *TII;
const X86FrameLowering *TFL;
const X86Subtarget *STI;
- const MachineRegisterInfo *MRI;
+ MachineRegisterInfo *MRI;
unsigned SlotSize;
unsigned Log2SlotSize;
static char ID;
@@ -125,14 +125,6 @@ bool X86CallFrameOptimization::isLegal(M
if (NoX86CFOpt.getValue())
return false;
- // We currently only support call sequences where *all* parameters.
- // are passed on the stack.
- // No point in running this in 64-bit mode, since some arguments are
- // passed in-register in all common calling conventions, so the pattern
- // we're looking for will never match.
- if (STI->is64Bit())
- return false;
-
// We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
// in the compact unwind encoding that Darwin uses. So, bail if there
// is a danger of that being generated.
@@ -141,6 +133,11 @@ bool X86CallFrameOptimization::isLegal(M
(MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
return false;
+ // It is not valid to change the stack pointer outside the prolog/epilog
+ // on 64-bit Windows.
+ if (STI->isTargetWin64())
+ return false;
+
// You would expect straight-line code between call-frame setup and
// call-frame destroy. You would be wrong. There are circumstances (e.g.
// CMOV_GR8 expansion of a select that feeds a function call!) where we can
@@ -204,7 +201,7 @@ bool X86CallFrameOptimization::isProfita
// We can use pushes. First, account for the fixed costs.
// We'll need a add after the call.
Advantage -= 3;
- // If we have to realign the stack, we'll also need and sub before
+ // If we have to realign the stack, we'll also need a sub before
if (CC.ExpectedDist % StackAlign)
Advantage -= 3;
// Now, for each push, we save ~3 bytes. For small constants, we actually,
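(As a rough worked example of this accounting, using the ~3-byte figures from
the comments above, illustrative numbers only: a call sequence with four
pushes and no realignment saves about 4 x 3 = 12 bytes on the stores but pays
3 bytes for the add after the call, a net win of roughly 9 bytes; if the
stack must be realigned, the extra sub costs another 3, leaving roughly 6.)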
@@ -264,7 +261,8 @@ X86CallFrameOptimization::classifyInstru
// The instructions we actually care about are movs onto the stack
int Opcode = MI->getOpcode();
- if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
+ if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr ||
+ Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr)
return Convert;
// Not all calling conventions have only stack MOVs between the stack
@@ -457,6 +455,7 @@ bool X86CallFrameOptimization::adjustCal
FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
DebugLoc DL = FrameSetup->getDebugLoc();
+ bool Is64Bit = STI->is64Bit();
// Now, iterate through the vector in reverse order, and replace the movs
// with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
// replace uses.
@@ -469,7 +468,8 @@ bool X86CallFrameOptimization::adjustCal
default:
llvm_unreachable("Unexpected Opcode!");
case X86::MOV32mi:
- PushOpcode = X86::PUSHi32;
+ case X86::MOV64mi32:
+ PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
// If the operand is a small (8-bit) immediate, we can use a
// PUSH instruction with a shorter encoding.
// Note that isImm() may fail even though this is a MOVmi, because
@@ -477,14 +477,27 @@ bool X86CallFrameOptimization::adjustCal
if (PushOp.isImm()) {
int64_t Val = PushOp.getImm();
if (isInt<8>(Val))
- PushOpcode = X86::PUSH32i8;
+ PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
}
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
.addOperand(PushOp);
break;
case X86::MOV32mr:
+ case X86::MOV64mr:
unsigned int Reg = PushOp.getReg();
+ // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
+ // in preparation for the PUSH64. The upper 32 bits can be undef.
+ if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) {
+ unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
+ .addReg(UndefReg)
+ .addOperand(PushOp)
+ .addImm(X86::sub_32bit);
+ }
+
// If PUSHrmm is not slow on this target, try to fold the source of the
// push into the instruction.
bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
@@ -493,7 +506,7 @@ bool X86CallFrameOptimization::adjustCal
// conservative about that.
MachineInstr *DefMov = nullptr;
if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
- PushOpcode = X86::PUSH32rmm;
+ PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));
unsigned NumOps = DefMov->getDesc().getNumOperands();
@@ -502,7 +515,7 @@ bool X86CallFrameOptimization::adjustCal
DefMov->eraseFromParent();
} else {
- PushOpcode = X86::PUSH32r;
+ PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
.addReg(Reg)
.getInstr();
@@ -557,7 +570,8 @@ MachineInstr *X86CallFrameOptimization::
// Make sure the def is a MOV from memory.
// If the def is in another block, give up.
- if (DefMI->getOpcode() != X86::MOV32rm ||
+ if ((DefMI->getOpcode() != X86::MOV32rm &&
+ DefMI->getOpcode() != X86::MOV64rm) ||
DefMI->getParent() != FrameSetup->getParent())
return nullptr;
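To spell out the subregister idiom used in the MOV32mr hunk above: before a
64-bit push of a value that only exists as a 32-bit virtual register, the
pass manufactures a 64-bit register whose low half is the original value and
whose high half is explicitly undefined. Roughly, in MIR form (an
illustrative sketch with made-up register names, not compiler output):

  %undef<def> = IMPLICIT_DEF                            ; 64-bit vreg, contents undef
  %full<def>  = INSERT_SUBREG %undef, %val32, sub_32bit ; low 32 bits = %val32
  PUSH64r %full                                         ; pushes 64 bits; upper half is don't-care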
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=268227&r1=268226&r2=268227&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon May 2 08:45:25 2016
@@ -2141,6 +2141,12 @@ int X86InstrInfo::getSPAdjust(const Mach
case X86::PUSH32rmr:
case X86::PUSHi32:
return 4;
+ case X86::PUSH64i8:
+ case X86::PUSH64r:
+ case X86::PUSH64rmm:
+ case X86::PUSH64rmr:
+ case X86::PUSH64i32:
+ return 8;
}
}
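These sizes drive the CFI bookkeeping: each 64-bit push moves %rsp by 8
bytes, so the four pushes emitted for test1 below account for 4 x 8 = 32
bytes, which is exactly the addq $32, %rsp and the .cfi_adjust_cfa_offset -32
that the test checks for.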
Added: llvm/trunk/test/CodeGen/X86/movtopush64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movtopush64.ll?rev=268227&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/movtopush64.ll (added)
+++ llvm/trunk/test/CodeGen/X86/movtopush64.ll Mon May 2 08:45:25 2016
@@ -0,0 +1,193 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=NORMAL -check-prefix=NORMALFP
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=NOPUSH
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=NOPUSH -check-prefix=NORMALFP
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
+
+declare void @seven_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g)
+declare void @ten_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i64 %h, i32 %i, i64 %j)
+declare void @ten_params_ptr(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i8* %h, i32 %i, i64 %j)
+declare void @cannot_push(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i)
+
+; We should get pushes for the last 4 parameters. Test that the
+; in-register parameters are all in the right places, and check
+; that the stack manipulations are correct and correctly
+; described by the DWARF directives. Test that the switch
+; to disable the optimization works and that the optimization
+; doesn't kick in on Windows64 where it is not allowed.
+; NORMAL-LABEL: test1
+; NORMAL: pushq
+; NORMAL-DAG: movl $1, %edi
+; NORMAL-DAG: movl $2, %esi
+; NORMAL-DAG: movl $3, %edx
+; NORMAL-DAG: movl $4, %ecx
+; NORMAL-DAG: movl $5, %r8d
+; NORMAL-DAG: movl $6, %r9d
+; NORMAL: pushq $10
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $9
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $8
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $7
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: callq ten_params
+; NORMAL: addq $32, %rsp
+; NORMAL: .cfi_adjust_cfa_offset -32
+; NORMAL: popq
+; NORMAL: retq
+; NOPUSH-LABEL: test1
+; NOPUSH-NOT: pushq
+; NOPUSH: retq
+define void @test1() {
+entry:
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ ret void
+}
+
+; The presence of a frame pointer should not prevent pushes. But we
+; don't need the CFI directives in that case.
+; Also check that we generate the right pushes for >8bit immediates.
+; NORMALFP-LABEL: test2
+; NORMALFP: pushq $10000
+; NORMALFP-NEXT: pushq $9000
+; NORMALFP-NEXT: pushq $8000
+; NORMALFP-NEXT: pushq $7000
+; NORMALFP-NEXT: callq {{_?}}ten_params
+define void @test2(i32 %k) {
+entry:
+ %a = alloca i32, i32 %k
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7000, i64 8000, i32 9000, i64 10000)
+ ret void
+}
+
+; Parameters 7 & 8 should push a 64-bit register.
+; TODO: Note that the regular expressions disallow r8 and r9. That's fine for
+; now, because the pushes will always follow the moves into r8 and r9.
+; Eventually, though, we want to be able to schedule the pushes better.
+; In this example, it will save two copies, because we have to move the
+; incoming parameters out of %rdi and %rsi to make room for the outgoing
+; parameters.
+; NORMAL-LABEL: test3
+; NORMAL: pushq $10000
+; NORMAL: pushq $9000
+; NORMAL: pushq %r{{..}}
+; NORMAL: pushq %r{{..}}
+; NORMAL: callq ten_params
+define void @test3(i32 %a, i64 %b) {
+entry:
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %a, i64 %b, i32 9000, i64 10000)
+ ret void
+}
+
+; Check that we avoid the optimization for just one push.
+; NORMAL-LABEL: test4
+; NORMAL: movl $7, (%rsp)
+; NORMAL: callq seven_params
+define void @test4() {
+entry:
+ call void @seven_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7)
+ ret void
+}
+
+; Check that pushing link-time constant addresses works correctly
+; NORMAL-LABEL: test5
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $ext
+; NORMAL: pushq $7
+; NORMAL: callq ten_params_ptr
+@ext = external constant i8
+define void @test5() {
+entry:
+ call void @ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i8* @ext, i32 9, i64 10)
+ ret void
+}
+
+; Check that we fuse 64-bit loads but not 32-bit loads into PUSH mem
+; (folding a 32-bit load into a 64-bit push would widen the memory access).
+; NORMAL-LABEL: test6
+; NORMAL: movq %rsi, [[REG64:%.+]]
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq ([[REG64]])
+; NORMAL: pushq {{%r..}}
+; NORMAL: callq ten_params
+define void @test6(i32* %p32, i64* %p64) {
+entry:
+ %v32 = load i32, i32* %p32
+ %v64 = load i64, i64* %p64
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %v32, i64 %v64, i32 9, i64 10)
+ ret void
+}
+
+; Fold stack-relative loads into the push with correct offsets.
+; Do the same for an indirect call whose address is loaded from the stack.
+; On entry, %p7 is at 8(%rsp) and %p8 is at 16(%rsp). Prior to the call
+; sequence, 72 bytes are allocated to the stack, 48 for register saves and
+; 24 for local storage and alignment, so %p7 is at 80(%rsp) and %p8 is at
+; 88(%rsp). The call address can be stored anywhere in the local space but
+; happens to be stored at 8(%rsp). Each push bumps these offsets up by
+; 8 bytes.
+; NORMAL-LABEL: test7
+; NORMAL: movq %r{{.*}}, 8(%rsp) {{.*Spill$}}
+; NORMAL: pushq 88(%rsp)
+; NORMAL: pushq $9
+; NORMAL: pushq 96(%rsp)
+; NORMAL: pushq $7
+; NORMAL: callq *40(%rsp)
+define void @test7(i64 %p1, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6, i64 %p7, i64 %p8) {
+entry:
+ %stack_fptr = alloca void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*
+ store void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)* @ten_params, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+ %ten_params_ptr = load volatile void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+ call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ call void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64) %ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %p7, i32 9, i64 %p8)
+ ret void
+}
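+
+; Spelling out the offset arithmetic the CHECK lines above rely on
+; (illustrative bookkeeping, not additional checks): %p8 enters at 16(%rsp),
+; and the 72 bytes of saves and locals make that 16 + 72 = 88(%rsp) for the
+; first push. By the time %p7 (entering at 8(%rsp), i.e. 8 + 72 = 80(%rsp))
+; is pushed, two earlier pushes have grown the frame by 16 bytes, so it is
+; read at 80 + 16 = 96(%rsp).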
+
+; We can't fold the load from the global into the push because of
+; interference from the store
+; NORMAL-LABEL: test8
+; NORMAL: movq the_global(%rip), [[REG:%r.+]]
+; NORMAL: movq $42, the_global
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq [[REG]]
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+@the_global = external global i64
+define void @test8() {
+ %myload = load i64, i64* @the_global
+ store i64 42, i64* @the_global
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %myload, i32 9, i64 10)
+ ret void
+}
+
+
+; Converting one function call to use pushes negatively affects
+; other calls that pass arguments on the stack without pushes.
+; If the cost outweighs the benefit, avoid using pushes.
+; NORMAL-LABEL: test9
+; NORMAL: callq cannot_push
+; NORMAL-NOT: push
+; NORMAL: callq ten_params
+define void @test9(float %p1) {
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ ret void
+}
+
+; But if the benefit outweighs the cost, use pushes.
+; NORMAL-LABEL: test10
+; NORMAL: callq cannot_push
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $8
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+define void @test10(float %p1) {
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ ret void
+}