[llvm-commits] [Review request][Win64] Use pushq/popq GPRs in function prologue/epilogue

NAKAMURA Takumi geek4civic at gmail.com
Mon Feb 7 03:49:53 PST 2011


Anton,

I attached an updated patch, refined and comments added, thank you.
It is at; https://github.com/chapuni/LLVM/commit/8da7419f8170e5a11063eb8e992ecb1ad40f9f6a


On the github, Anton wrote;
> View Commit: https://github.com/chapuni/LLVM/commit/d719f1b1ba5f01823e00904c1599adf356089a1d

> This will pessimize non-win64 targets. Basically, the problem is that you cannot use pushq/popq for high xmm regs, which are callee-saved on win64-only (but not on linux / darwin, for example).

I thought the patch does not affect to non-win64 targets, but it might
be dubious to check whether regs are GPR or not.
I rewrote checking expressions.

I am sorry if I missed your point.

> Another problem is function prologue emission. Have you verified that you always have proper stack frame? Even in case when high xmm regs (xmm5, etc.) are spilled?

As far as spiller emits in order [pushq GPRs...] and [mov xmm to fp],
and restorer emits in order [mov xmm from fp] and [popq GPRs],
emitPrologue() and emitEpilogue() will emit adjusting %rsp onto proper place.
And I can expect spiller can place XMMs i128 aligned.

; for example
define void @foo() nounwind {
entry:
  tail call void (...)* @bar() nounwind
  tail call void asm sideeffect "nop",
"~{si},~{di},~{xmm13},~{xmm11},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
nounwind
  ret void
}
declare void @bar(...)

#### -mtriple=x86_64-mingw32
foo:
        pushq   %rsi
        pushq   %rdi
        subq    $88, %rsp
        movaps  %xmm15, 32(%rsp)        # 16-byte Spill
        movaps  %xmm13, 48(%rsp)        # 16-byte Spill
        movaps  %xmm11, 64(%rsp)        # 16-byte Spill
        callq   bar
        #APP
        nop
        #NO_APP
        movaps  64(%rsp), %xmm11        # 16-byte Reload
        movaps  48(%rsp), %xmm13        # 16-byte Reload
        movaps  32(%rsp), %xmm15        # 16-byte Reload
        addq    $88, %rsp
        popq    %rdi
        popq    %rsi
        ret

And also, I have checked to build clang by 3 stage.
(x64-clang can build and test clang and llvm)


...Takumi
-------------- next part --------------
From 8da7419f8170e5a11063eb8e992ecb1ad40f9f6a Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Thu, 20 Jan 2011 13:24:27 +0900
Subject: [PATCH] lib/Target/X86/X86FrameLowering.cpp: On Win64, pushq/popq GPR can be used instead of n(%rsp) and n(%rbp).

---
 lib/Target/X86/X86FrameLowering.cpp |   56 ++++++++++++++++++++++++----------
 test/CodeGen/X86/tailcallstack64.ll |    2 +-
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 0a3f931..3246c43 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -892,7 +892,6 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 
   MachineFunction &MF = *MBB.getParent();
 
-  bool isWin64 = STI.isTargetWin64();
   unsigned SlotSize = STI.is64Bit() ? 8 : 4;
   unsigned FPReg = TRI->getFrameRegister(MF);
   unsigned CalleeFrameSize = 0;
@@ -900,25 +899,39 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
+  // Push GPRs. It increases frame size.
   unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i-1].getReg();
+    if (!X86::GR64RegClass.contains(Reg) &&
+        !X86::GR32RegClass.contains(Reg))
+      continue;
     // Add the callee-saved register as live-in. It's killed at the spill.
     MBB.addLiveIn(Reg);
     if (Reg == FPReg)
       // X86RegisterInfo::emitPrologue will handle spilling of frame register.
       continue;
-    if (!X86::VR128RegClass.contains(Reg) && !isWin64) {
-      CalleeFrameSize += SlotSize;
-      BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill);
-    } else {
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
-                              RC, TRI);
-    }
+    CalleeFrameSize += SlotSize;
+    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill);
   }
 
   X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
+
+  // Make XMM regs spilled. X86 does not have ability of push/pop XMM.
+  // It can be done by spilling XMMs to stack frame.
+  // Note that only Win64 ABI might spill XMMs.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
+                            RC, TRI);
+  }
+
   return true;
 }
 
@@ -933,21 +946,30 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // Reload XMMs from stack frame.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
+                             RC, TRI);
+  }
+
+  // POP GPRs.
   unsigned FPReg = TRI->getFrameRegister(MF);
-  bool isWin64 = STI.isTargetWin64();
   unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
+    if (!X86::GR64RegClass.contains(Reg) &&
+        !X86::GR32RegClass.contains(Reg))
+      continue;
     if (Reg == FPReg)
       // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
       continue;
-    if (!X86::VR128RegClass.contains(Reg) && !isWin64) {
-      BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
-    } else {
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
-                               RC, TRI);
-    }
+    BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
   }
   return true;
 }
diff --git a/test/CodeGen/X86/tailcallstack64.ll b/test/CodeGen/X86/tailcallstack64.ll
index 0c732d5..0927779 100644
--- a/test/CodeGen/X86/tailcallstack64.ll
+++ b/test/CodeGen/X86/tailcallstack64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -tailcallopt -mtriple=x86_64-win32 -post-RA-scheduler=true | FileCheck %s
 
 ; FIXME: Redundant unused stack allocation could be eliminated.
-; CHECK: subq  ${{24|88}}, %rsp
+; CHECK: subq  ${{24|72}}, %rsp
 
 ; Check that lowered arguments on the stack do not overwrite each other.
 ; Add %in1 %p1 to a different temporary register (%eax).
-- 
1.7.1.GIT



More information about the llvm-commits mailing list