[llvm] c9c930a - [SelectionDAG] Don't promote the alignment of allocas beyond the stack alignment.

Eli Friedman via llvm-commits llvm-commits at lists.llvm.org
Mon May 11 17:40:02 PDT 2020


Author: Eli Friedman
Date: 2020-05-11T17:39:00-07:00
New Revision: c9c930ae67c38b93451aa979de723723aec0067d

URL: https://github.com/llvm/llvm-project/commit/c9c930ae67c38b93451aa979de723723aec0067d
DIFF: https://github.com/llvm/llvm-project/commit/c9c930ae67c38b93451aa979de723723aec0067d.diff

LOG: [SelectionDAG] Don't promote the alignment of allocas beyond the stack alignment.

Allocas in LLVM IR can carry an explicit alignment. When that alignment
is specified, the alloca is guaranteed at least that alignment at runtime.

If the allocated type has a higher preferred alignment, SelectionDAG
currently ignores the specified alignment and raises the alloca to the
type's preferred alignment, even if doing so triggers stack realignment.
I don't think this makes sense, so this patch changes that: the alignment
is still promoted toward the preferred alignment, but never beyond the
stack alignment.
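
To make the new rule concrete, here is a minimal standalone sketch of the
clamping logic (my simplification, not the LLVM sources: alignments are
modeled as plain byte counts rather than llvm::Align, and the example
numbers assume a 16-byte stack alignment, as on x86-64):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // New rule: promote toward the type's preferred alignment, but never
    // beyond the stack alignment; an explicit alignment written on the
    // alloca is always honored.
    uint64_t allocaAlign(uint64_t TyPrefAlign, uint64_t SpecifiedAlign,
                         uint64_t StackAlign) {
      return std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);
    }

    int main() {
      // <16 x float>: 64-byte preferred alignment, "align 16" on the alloca.
      std::cout << allocaAlign(64, 16, 16) << '\n'; // previously 64, now 16
      // An explicit "align 64" on the alloca still forces realignment.
      std::cout << allocaAlign(64, 64, 16) << '\n'; // stays 64
    }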

I was looking into this for SVE in particular: for SVE, overaligning
vscale'd types is extra expensive because it requires realigning the
stack multiple times or using dynamic allocation. (That support isn't
currently implemented.)

I updated the expected assembly for a couple of tests; in particular,
for arg-copy-elide.ll, the argument-copy-elision optimization does not
increase the alignment the way SelectionDAG normally would, so those
functions no longer set up a frame pointer to realign the stack. For the
rest, I just increased the specified alignment on the allocas to match
what SelectionDAG was previously inferring.

Differential Revision: https://reviews.llvm.org/D79532

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
    llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
    llvm/test/CodeGen/AMDGPU/private-element-size.ll
    llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
    llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
    llvm/test/CodeGen/Thumb2/mve-basic.ll
    llvm/test/CodeGen/X86/arg-copy-elide.ll
    llvm/test/CodeGen/X86/avx2-vbroadcast.ll
    llvm/test/CodeGen/X86/avx512-intel-ocl.ll
    llvm/test/CodeGen/X86/load-local-v3i129.ll
    llvm/test/CodeGen/X86/movtopush.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 3f302d5fa0ca..c4821254f897 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -134,8 +134,20 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
     for (const Instruction &I : BB) {
       if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
         Type *Ty = AI->getAllocatedType();
+        Align TyPrefAlign = MF->getDataLayout().getPrefTypeAlign(Ty);
+        // The "specified" alignment is the alignment written on the alloca,
+        // or the preferred alignment of the type if none is specified.
+        //
+        // (Unspecified alignment on allocas will be going away soon.)
+        Align SpecifiedAlign = AI->getAlign() ? *AI->getAlign() : TyPrefAlign;
+
+        // If the preferred alignment of the type is higher than the specified
+        // alignment of the alloca, promote the alignment, as long as it doesn't
+        // require realigning the stack.
+        //
+        // FIXME: Do we really want to second-guess the IR in isel?
         Align Alignment =
-            max(MF->getDataLayout().getPrefTypeAlign(Ty), AI->getAlign());
+            std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);
 
         // Static allocas can be folded into the initial stack frame
         // adjustment. For targets that don't realign the stack, don't

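(For example, in the tests below: on x86-64 the stack alignment is 16
bytes, while <4 x i64> prefers 32-byte alignment and <16 x float> prefers
64 on these targets, so an alloca marked "align 16" now stays 16-byte
aligned instead of being promoted. The test updates therefore either drop
the no-longer-needed realignment sequences from the expected assembly or
spell the previously inferred alignment explicitly on the alloca.)
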
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll b/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
index 4aae5bef22e2..915ffcf17b05 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
@@ -15,3 +15,18 @@ define i32 @foo(<vscale x 16 x i8> %val) {
 }
 
 declare i32 @bar(<vscale x 16 x i8>* %ptr);
+
+; CHECKCG-LABEL: foo2:
+; CHECKCG: addvl   sp, sp, #-2
+
+; CHECKISEL-LABEL: name: foo2
+; CHECKISEL:       stack:
+; CHECKISEL:       id: 0, name: ptr, type: default, offset: 0, size: 32, alignment: 16,
+; CHECKISEL-NEXT:  stack-id: sve-vec
+
+define i32 @foo2(<vscale x 32 x i8> %val) {
+  %ptr = alloca <vscale x 32 x i8>, align 16
+  %res = call i32 @bar2(<vscale x 32 x i8>* %ptr)
+  ret i32 %res
+}
+declare i32 @bar2(<vscale x 32 x i8>* %ptr);

diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
index d5e5ba5202fc..843f554b0513 100644
--- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
@@ -113,7 +113,7 @@ entry:
   %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
   %index.load = load i32, i32 addrspace(1)* %gep.index
   %index = and i32 %index.load, 2
-  %alloca = alloca [2 x <8 x i32>], align 16, addrspace(5)
+  %alloca = alloca [2 x <8 x i32>], align 32, addrspace(5)
   %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 0
   %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 1
   store <8 x i32> zeroinitializer, <8 x i32> addrspace(5)* %gep0

diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index c8fb1ec72d50..32bb1ad95262 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -51,8 +51,8 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; GCN-NEXT:  BB0_2: ; %shader_eval_surface.exit
 ; GCN-NEXT:    s_endpgm
 entry:
-  %sd = alloca < 1339 x i32>, align 16, addrspace(5)
-  %state = alloca <4 x i32>, align 4, addrspace(5)
+  %sd = alloca < 1339 x i32>, align 8192, addrspace(5)
+  %state = alloca <4 x i32>, align 16, addrspace(5)
   %rslt = call i32 @svm_eval_nodes(float addrspace(5)* %kg, <1339 x i32> addrspace(5)* %sd, <4 x i32> addrspace(5)* %state, i32 0, i32 4194304)
   %cmp = icmp eq i32 %rslt, 0
   br i1 %cmp, label %shader_eval_surface.exit, label %if.then4.i

diff --git a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
index 1d759ba21ec7..b6f3e0747568 100644
--- a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -34,7 +34,7 @@ entry:
 ; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
 ; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
 ; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
- %retval = alloca <16 x float>, align 16
+ %retval = alloca <16 x float>, align 64
  %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
  %1 = load <16 x float>, <16 x float>* %retval
@@ -73,7 +73,7 @@ entry:
 ; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
 
 
-%retval = alloca <16 x float>, align 16
+%retval = alloca <16 x float>, align 64
  %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
  %1 = load <16 x float>, <16 x float>* %retval

diff --git a/llvm/test/CodeGen/Thumb2/mve-basic.ll b/llvm/test/CodeGen/Thumb2/mve-basic.ll
index 2b5ef2f014a7..797295c5b201 100644
--- a/llvm/test/CodeGen/Thumb2/mve-basic.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-basic.ll
@@ -44,18 +44,12 @@ define void @vector_f64_copy(<2 x double>* %from, <2 x double>* %to) {
 define arm_aapcs_vfpcc <16 x i8> @stack_slot_handling(<16 x i8> %a) #0 {
 ; CHECK-LABEL: stack_slot_handling:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    add r7, sp, #8
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    bfc r4, #0, #4
-; CHECK-NEXT:    mov sp, r4
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    sub.w r4, r7, #8
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    bx lr
 entry:
   %a.addr = alloca <16 x i8>, align 8
   store <16 x i8> %a, <16 x i8>* %a.addr, align 8

diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
index 603e50ff30a3..705a35615561 100644
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -53,22 +53,18 @@ entry:
 }
 
 ; CHECK-LABEL: _split_i64:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
 ; CHECK: pushl %[[csr2:[^ ]*]]
 ; CHECK: pushl %[[csr1:[^ ]*]]
-; CHECK: andl $-8, %esp
-; CHECK-DAG: movl 8(%ebp), %[[csr1]]
-; CHECK-DAG: movl 12(%ebp), %[[csr2]]
-; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl 12(%esp), %[[csr1]]
+; CHECK-DAG: movl 16(%esp), %[[csr2]]
+; CHECK-DAG: leal 12(%esp), %[[reg:[^ ]*]]
 ; CHECK: pushl %[[reg]]
 ; CHECK: calll _addrof_i64
+; CHECK: addl $4, %esp
 ; CHECK-DAG: movl %[[csr1]], %eax
 ; CHECK-DAG: movl %[[csr2]], %edx
-; CHECK: leal -8(%ebp), %esp
 ; CHECK: popl %[[csr1]]
 ; CHECK: popl %[[csr2]]
-; CHECK: popl %ebp
 ; CHECK: retl
 
 define i1 @i1_arg(i1 %x) {
@@ -101,16 +97,13 @@ entry:
 }
 
 ; CHECK-LABEL: _fastcc_split_i64:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
 ; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
-; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl 20(%esp), %[[r2:[^ ]*]]
 ; CHECK-DAG: movl %[[r2]], 4(%esp)
 ; CHECK-DAG: movl %edx, (%esp)
 ; CHECK: movl %esp, %[[reg:[^ ]*]]
 ; CHECK: pushl %[[reg]]
 ; CHECK: calll _addrof_i64
-; CHECK: popl %ebp
 ; CHECK: retl
 
 

diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index da77ee5be7fc..edae4f094c9b 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1164,9 +1164,9 @@ define void @isel_crash_32b(i8* %cV_R.addr) {
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i8, i8* %cV_R.addr, align 4
@@ -1255,9 +1255,9 @@ define void @isel_crash_16w(i16* %cV_R.addr) {
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i16, i16* %cV_R.addr, align 4
@@ -1346,9 +1346,9 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i32, i32* %cV_R.addr, align 4
@@ -1436,9 +1436,9 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i64, i64* %cV_R.addr, align 4

diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
index 232b598c6109..b92854125298 100644
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -77,7 +77,7 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
-  %y = alloca <16 x float>, align 16
+  %y = alloca <16 x float>, align 64
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
   %2 = load <16 x float>, <16 x float>* %y, align 16
@@ -158,7 +158,7 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
-  %y = alloca <16 x float>, align 16
+  %y = alloca <16 x float>, align 64
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
   %2 = load <16 x float>, <16 x float>* %y, align 16

diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index 00b53ef9740b..8484f6624690 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -4,26 +4,20 @@
 define void @_start() {
 ; CHECK-LABEL: _start:
 ; CHECK:       # %bb.0: # %Entry
-; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq %rsp, %rbp
-; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-; CHECK-NEXT:    andq $-128, %rsp
-; CHECK-NEXT:    subq $256, %rsp # imm = 0x100
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    shrdq $2, %rcx, %rax
 ; CHECK-NEXT:    shrq $2, %rcx
 ; CHECK-NEXT:    leaq 1(,%rax,4), %rdx
-; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    shrdq $62, %rcx, %rax
-; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    orq $-2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rbp, %rsp
-; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    orq $-2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 Entry:
   %y = alloca <3 x i129>, align 4

diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll
index 7c067beb58ad..9f301a22a481 100644
--- a/llvm/test/CodeGen/X86/movtopush.ll
+++ b/llvm/test/CodeGen/X86/movtopush.ll
@@ -246,7 +246,7 @@ define void @test9() optsize {
 entry:
   %p = alloca i32, align 4
   %q = alloca i32, align 4
-  %s = alloca %struct.s, align 4  
+  %s = alloca %struct.s, align 8
   call void @good(i32 1, i32 2, i32 3, i32 4)
   %pv = ptrtoint i32* %p to i32
   %qv = ptrtoint i32* %q to i32
@@ -407,7 +407,7 @@ declare void @B_func(%struct.B* sret, %struct.B*, i32)
 define void @test14(%struct.A* %a) {
 entry:
   %ref.tmp = alloca %struct.B, align 1
-  %agg.tmp = alloca i64, align 4
+  %agg.tmp = alloca i64, align 8
   %tmpcast = bitcast i64* %agg.tmp to %struct.A*
   %tmp = alloca %struct.B, align 1
   %0 = bitcast %struct.A* %a to i64*


        

