[PATCH] Ensure stack is properly aligned for call argument initialization

Jeroen Ketema jeroen at codeplay.com
Wed Aug 5 03:07:17 PDT 2015


Arguments spilled to the stack before a function call may have
alignment requirements, for example in the case of vectors.
The code generator exploits these requirements by using move
instructions with matching alignment requirements, e.g., movaps
on x86.

Although the code generator properly aligns the arguments with
respect to the stack-pointer displacement it computes, the
displacement itself may cause misalignment. For example, if we
have

   %3 = load <16 x float>, <16 x float>* %1, align 64
   call void @bar(<16 x float> %3, i32 0)

The x86 back-end emits:

   movaps  32(%ecx), %xmm2
   movaps  (%ecx), %xmm0
   movaps  16(%ecx), %xmm1
   movaps  48(%ecx), %xmm3
   subl    $20, %esp       <-- if %esp was 16-byte aligned before this
                                instruction, it will no longer be afterwards
   movaps  %xmm3, (%esp)   <-- movaps requires 16-byte alignment, while %esp
                               is not aligned as such.
   movl    $0, 16(%esp)
   calll   __bar

To solve this, we need to make sure that the value by which the
stack pointer is adjusted is a multiple of the maximal alignment
seen during its computation. With this change we get proper
alignment:

   subl    $32, %esp
   movaps  %xmm3, (%esp)
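
For illustration only (not part of this patch), the following is a
minimal standalone C++ sketch of the round-up that getNextStackOffset()
now performs; alignTo is a local helper introduced just for this example
and assumes the alignment is a power of two:

   #include <cassert>
   #include <cstdio>

   // Round Offset up to the next multiple of Align.
   static unsigned alignTo(unsigned Offset, unsigned Align) {
     assert((Align & (Align - 1)) == 0 && "alignment must be a power of two");
     return (Offset + Align - 1) & ~(Align - 1);
   }

   int main() {
     // In the example above, the spilled 16-byte vector chunk plus the
     // 4-byte i32 give a raw StackOffset of 20; rounding up to the
     // maximal alignment of 16 yields the 32 bytes subtracted from %esp.
     std::printf("%u\n", alignTo(20, 16)); // prints 32
     return 0;
   }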
---
  include/llvm/CodeGen/CallingConvLower.h |  6 +++++-
  lib/CodeGen/CallingConvLower.cpp        |  3 +++
  test/CodeGen/X86/aligned-variadic.ll    |  2 +-
  test/CodeGen/X86/win32-spill-xmm.ll     | 14 ++++++++++++++
  4 files changed, 23 insertions(+), 2 deletions(-)
  create mode 100644 test/CodeGen/X86/win32-spill-xmm.ll

diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index 1fd4eeb..79f83ba 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -201,6 +201,7 @@ private:
    LLVMContext &Context;
     unsigned StackOffset;
+  unsigned MinStackAlign;
    SmallVector<uint32_t, 16> UsedRegs;
    SmallVector<CCValAssign, 4> PendingLocs;
@@ -270,7 +271,9 @@ public:
    CallingConv::ID getCallingConv() const { return CallingConv; }
    bool isVarArg() const { return IsVarArg; }
-  unsigned getNextStackOffset() const { return StackOffset; }
+  unsigned getNextStackOffset() const {
+    return ((StackOffset + MinStackAlign - 1) & ~(MinStackAlign - 1));
+  }
  /// isAllocated - Return true if the specified register (or an alias) is
    /// allocated.
@@ -403,6 +406,7 @@ public:
      StackOffset = ((StackOffset + Align - 1) & ~(Align - 1));
      unsigned Result = StackOffset;
      StackOffset += Size;
+    MinStackAlign = Align > MinStackAlign ? Align : MinStackAlign;
      MF.getFrameInfo()->ensureMaxAlignment(Align);
      return Result;
    }
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index fb29b1d..559e076 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -32,6 +32,7 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
        CallOrPrologue(Unknown) {
    // No stack is used.
    StackOffset = 0;
+  MinStackAlign = 1;
     clearByValRegsInfo();
    UsedRegs.resize((TRI.getNumRegs()+31)/32);
@@ -192,6 +193,7 @@ static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) {
  void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
                                            MVT VT, CCAssignFn Fn) {
    unsigned SavedStackOffset = StackOffset;
+  unsigned SavedMinStackAlign = MinStackAlign;
    unsigned NumLocs = Locs.size();
     // Set the 'inreg' flag if it is used for this calling convention.
@@ -223,6 +225,7 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
  // as allocated so that future queries don't return the same registers, i.e.
    // when i64 and f64 are both passed in GPRs.
    StackOffset = SavedStackOffset;
+  MinStackAlign = SavedMinStackAlign;
    Locs.resize(NumLocs);
  }
diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll
index 2941592..2eb8a58 100644
--- a/test/CodeGen/X86/aligned-variadic.ll
+++ b/test/CodeGen/X86/aligned-variadic.ll
@@ -15,7 +15,7 @@ entry:
    %overflow_arg_area = load i8*, i8** %overflow_arg_area_p, align 8
  %overflow_arg_area.next = getelementptr i8, i8* %overflow_arg_area, i64 24
    store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
-; X32: leal    68(%esp), [[REG:%.*]]
+; X32: leal    72(%esp), [[REG:%.*]]
  ; X32: movl    [[REG]], 16(%esp)
  ; X64: leaq    232(%rsp), [[REG:%.*]]
  ; X64: movq    [[REG]], 184(%rsp)
diff --git a/test/CodeGen/X86/win32-spill-xmm.ll b/test/CodeGen/X86/win32-spill-xmm.ll
new file mode 100644
index 0000000..80741dd
--- /dev/null
+++ b/test/CodeGen/X86/win32-spill-xmm.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s
+; CHECK: subl    $32, %esp
+; CHECK: movaps  %xmm3, (%esp)
+; CHECK: movl    $0, 16(%esp)
+
+declare void @bar(<16 x float> %a, i32 %b) nounwind
+
+define void @foo(i32, <16 x float> * nocapture readonly) nounwind {
+entry:
+  %2 = alloca i32, i32 %0
+  %3 = load <16 x float>, <16 x float> * %1, align 64
+  tail call void @bar(<16 x float> %3, i32 0) nounwind
+  ret void
+}
-- 
1.9.5.msysgit.1


