[llvm] r298116 - [X86] Emit fewer instructions to allocate >16GB stack frames

Reid Kleckner via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 17 13:25:49 PDT 2017


Author: rnk
Date: Fri Mar 17 15:25:49 2017
New Revision: 298116

URL: http://llvm.org/viewvc/llvm-project?rev=298116&view=rev
Log:
[X86] Emit fewer instructions to allocate >16GB stack frames

Summary:
Use this code pattern when RAX is live, instead of emitting up to 2
billion adjustments:
  pushq %rax
  movabsq +-$Offset+-8, %rax
  addq %rsp, %rax
  xchg %rax, (%rsp)
  movq (%rsp), %rsp

Try to clean this code up a bit while I'm here. In particular, hoist the
logic that handles the entire adjustment with `movabsq $imm, %rax` out
of the loop.

This negates the offset in the prologue and uses ADD because X86 only
has a two operand subtract which always subtracts from the destination
register, which can no longer be RSP.

Fixes PR31962

Reviewers: majnemer, sdardis

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30052

Added:
    llvm/trunk/test/CodeGen/X86/huge-stack-offset2.ll
Modified:
    llvm/trunk/lib/Target/X86/X86FrameLowering.cpp

Modified: llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FrameLowering.cpp?rev=298116&r1=298115&r2=298116&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.cpp Fri Mar 17 15:25:49 2017
@@ -252,40 +252,76 @@ void X86FrameLowering::emitSPUpdate(Mach
                                     int64_t NumBytes, bool InEpilogue) const {
   bool isSub = NumBytes < 0;
   uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  MachineInstr::MIFlag Flag =
+      isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
 
   uint64_t Chunk = (1LL << 31) - 1;
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
-  while (Offset) {
-    if (Offset > Chunk) {
-      // Rather than emit a long series of instructions for large offsets,
-      // load the offset into a register and do one sub/add
-      unsigned Reg = 0;
+  if (Offset > Chunk) {
+    // Rather than emit a long series of instructions for large offsets,
+    // load the offset into a register and do one sub/add
+    unsigned Reg = 0;
+    unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
 
-      if (isSub && !isEAXLiveIn(MBB))
-        Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
-      else
-        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+    if (isSub && !isEAXLiveIn(MBB))
+      Reg = Rax;
+    else
+      Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
 
-      if (Reg) {
-        unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
-        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
-          .addImm(Offset);
-        Opc = isSub
-          ? getSUBrrOpcode(Is64Bit)
-          : getADDrrOpcode(Is64Bit);
-        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
-          .addReg(StackPtr)
-          .addReg(Reg);
-        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
-        Offset = 0;
-        continue;
-      }
+    unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+    unsigned AddSubRROpc =
+        isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+    if (Reg) {
+      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+          .addImm(Offset)
+          .setMIFlag(Flag);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
+                             .addReg(StackPtr)
+                             .addReg(Reg);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+      return;
+    } else if (Offset > 8 * Chunk) {
+      // If we would need more than 8 add or sub instructions (a >16GB stack
+      // frame), it's worth spilling RAX to materialize this immediate.
+      //   pushq %rax
+      //   movabsq +-$Offset+-SlotSize, %rax
+      //   addq %rsp, %rax
+      //   xchg %rax, (%rsp)
+      //   movq (%rsp), %rsp
+      assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+          .addReg(Rax, RegState::Kill)
+          .setMIFlag(Flag);
+      // Subtract is not commutative, so negate the offset and always use add.
+      // Subtract 8 less and add 8 more to account for the PUSH we just did.
+      if (isSub)
+        Offset = -(Offset - SlotSize);
+      else
+        Offset = Offset + SlotSize;
+      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+          .addImm(Offset)
+          .setMIFlag(Flag);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
+                             .addReg(Rax)
+                             .addReg(StackPtr);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+      // Exchange the new SP in RAX with the top of the stack.
+      addRegOffset(
+          BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
+          StackPtr, false, 0);
+      // Load new SP from the top of the stack into RSP.
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
+                   StackPtr, false, 0);
+      return;
     }
+  }
 
+  while (Offset) {
     uint64_t ThisVal = std::min(Offset, Chunk);
-    if (ThisVal == (Is64Bit ? 8 : 4)) {
-      // Use push / pop instead.
+    if (ThisVal == SlotSize) {
+      // Use push / pop for slot sized adjustments as a size optimization. We
+      // need to find a dead register when using pop.
       unsigned Reg = isSub
         ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
         : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -293,23 +329,16 @@ void X86FrameLowering::emitSPUpdate(Mach
         unsigned Opc = isSub
           ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
           : (Is64Bit ? X86::POP64r  : X86::POP32r);
-        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
-          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
-        if (isSub)
-          MI->setFlag(MachineInstr::FrameSetup);
-        else
-          MI->setFlag(MachineInstr::FrameDestroy);
+        BuildMI(MBB, MBBI, DL, TII.get(Opc))
+            .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
+            .setMIFlag(Flag);
         Offset -= ThisVal;
         continue;
       }
     }
 
-    MachineInstrBuilder MI = BuildStackAdjustment(
-        MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
-    if (isSub)
-      MI.setMIFlag(MachineInstr::FrameSetup);
-    else
-      MI.setMIFlag(MachineInstr::FrameDestroy);
+    BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
+        .setMIFlag(Flag);
 
     Offset -= ThisVal;
   }

Added: llvm/trunk/test/CodeGen/X86/huge-stack-offset2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/huge-stack-offset2.ll?rev=298116&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/huge-stack-offset2.ll (added)
+++ llvm/trunk/test/CodeGen/X86/huge-stack-offset2.ll Fri Mar 17 15:25:49 2017
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK
+
+; Test how we handle pathologically large stack frames when RAX is live through
+; the prologue and epilogue.
+
+declare void @bar(i8*)
+declare void @llvm.va_start(i8*)
+
+; For stack frames between 2GB and 16GB, do multiple adjustments.
+
+define i32 @stack_frame_8gb(i32 %x, ...) nounwind {
+; CHECK-LABEL: stack_frame_8gb:
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp
+; CHECK:      callq bar
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp
+; CHECK:      retq
+  %1 = alloca [u0x200000000 x i8]
+  %va = alloca i8, i32 24
+  call void @llvm.va_start(i8* %va)
+  %2 = getelementptr inbounds [u0x200000000 x i8], [u0x200000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
+; For stack frames larger than 16GB, spill RAX instead of doing a linear number
+; of adjustments.
+
+; This function should have a frame size of 0x4000000D0. The 0xD0 is 208 bytes
+; from 24 bytes of va_list, 176 bytes of spilled varargs regparms, and 8 bytes
+; of alignment. We subtract 8 less and add 8 more in the prologue and epilogue
+; respectively to account for the PUSH.
+
+define i32 @stack_frame_16gb(i32 %x, ...) nounwind {
+; CHECK-LABEL: stack_frame_16gb:
+; CHECK:      pushq %rax
+; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0xFFFFFFFBFFFFFF38
+; CHECK-NEXT: addq %rsp, %rax
+; CHECK-NEXT: xchgq %rax, (%rsp)
+; CHECK-NEXT: movq (%rsp), %rsp
+; CHECK:      callq bar
+; CHECK:      pushq %rax
+; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0x4000000D8
+; CHECK-NEXT: addq %rsp, %rax
+; CHECK-NEXT: xchgq %rax, (%rsp)
+; CHECK-NEXT: movq (%rsp), %rsp
+; CHECK:      retq
+  %1 = alloca [u0x400000000 x i8]
+  %va = alloca i8, i32 24
+  call void @llvm.va_start(i8* %va)
+  %2 = getelementptr inbounds [u0x400000000 x i8], [u0x400000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+




More information about the llvm-commits mailing list