[llvm] r313458 - [x86] enable storeOfVectorConstantIsCheap() target hook

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 16 06:29:12 PDT 2017


Author: spatel
Date: Sat Sep 16 06:29:12 2017
New Revision: 313458

URL: http://llvm.org/viewvc/llvm-project?rev=313458&view=rev
Log:
[x86] enable storeOfVectorConstantIsCheap() target hook

This allows vector-sized store merging of constants in DAGCombiner using the existing code in MergeConsecutiveStores().
All of the twisted logic that decides exactly which vector operations are legal and fast for each particular CPU is
handled separately in there using the appropriate hooks.
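
As a rough illustration (not part of the patch; the function name is made up, but the pattern mirrors the big_nonzero_16_bytes test updated below), this is the kind of IR that MergeConsecutiveStores() can now turn into a single vector constant load plus one vector store once the hook reports the vector store as cheap:

  define void @four_scalar_constant_stores(i32* %p) {
    %p1 = getelementptr inbounds i32, i32* %p, i64 1
    %p2 = getelementptr inbounds i32, i32* %p, i64 2
    %p3 = getelementptr inbounds i32, i32* %p, i64 3
    store i32 1, i32* %p
    store i32 2, i32* %p1
    store i32 3, i32* %p2
    store i32 4, i32* %p3
    ret void
  }

With AVX enabled, the four 32-bit stores can become one vmovaps of the constant [1,2,3,4] and one vmovups, as the updated checks below show.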

For the motivating tests in merge-store-constants.ll, we already produce the same vector code in IR via the SLP vectorizer.
So this is just providing a backend backstop for code that doesn't go through that pass (e.g., at -O1). More details are in PR24449:
https://bugs.llvm.org/show_bug.cgi?id=24449 (this change should be the last step to resolve that bug).

Differential Revision: https://reviews.llvm.org/D37451

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/test/CodeGen/X86/avx512-regcall-Mask.ll
    llvm/trunk/test/CodeGen/X86/merge-store-constants.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=313458&r1=313457&r2=313458&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Sat Sep 16 06:29:12 2017
@@ -1037,6 +1037,13 @@ namespace llvm {
     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                  unsigned Index) const override;
 
+    bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
+                                      unsigned AddrSpace) const override {
+      // If we can replace more than 2 scalar stores, there will be a reduction
+      // in instructions even after we add a vector constant load.
+      return NumElem > 2;
+    }
+
     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {
       return nullptr; // nothing to do, move along.
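
To illustrate the "NumElem > 2" cutoff in the hook above (a sketch only; the function name is illustrative), two consecutive 64-bit constant stores are left alone on x86-64 because a vector constant load plus a vector store would not reduce the instruction count there, though the new TODO in the test notes that this assumption is debatable for constants that need movabsq:

  define void @two_scalar_constant_stores(i64* %p) {
    %p1 = getelementptr inbounds i64, i64* %p, i64 1
    store i64 4294967297, i64* %p
    store i64 12884901889, i64* %p1
    ret void
  }

This is the two-store case exercised by the new big_nonzero_16_bytes_big64bit_constants test below: on 32-bit x86 the same stores legalize to four 32-bit elements and do get merged into an xmm store, while the 64-bit target keeps the two movabsq/movq pairs.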

Modified: llvm/trunk/test/CodeGen/X86/avx512-regcall-Mask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-regcall-Mask.ll?rev=313458&r1=313457&r2=313458&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-regcall-Mask.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-regcall-Mask.ll Sat Sep 16 06:29:12 2017
@@ -96,34 +96,21 @@ define x86_regcallcc i64 @test_argv64i1(
 }
 
 ; X32-LABEL:  caller_argv64i1:
-; X32:        movl    $2, %eax
-; X32:        movl    $1, %ecx
-; X32:        movl    $2, %edx
-; X32:        movl    $1, %edi
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        call{{.*}}   _test_argv64i1
-        
+; X32:  pushl %edi
+; X32:  subl  $88, %esp
+; X32:  vmovaps __xmm@00000001000000020000000100000002, %xmm0 # xmm0 = [2,1,2,1]
+; X32:  vmovups %xmm0, 64(%esp)
+; X32:  vmovaps LCPI1_1, %zmm0          # zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32:  vmovups %zmm0, (%esp)
+; X32:  movl  $1, 84(%esp)
+; X32:  movl  $2, 80(%esp)
+; X32:  movl  $2, %eax
+; X32:  movl  $1, %ecx
+; X32:  movl  $2, %edx
+; X32:  movl  $1, %edi
+; X32:  vzeroupper
+; X32:  calll _test_argv64i1
+ 
 ; WIN64-LABEL: caller_argv64i1:
 ; WIN64:       movabsq    $4294967298, %rax
 ; WIN64:       movq   %rax, (%rsp)

Modified: llvm/trunk/test/CodeGen/X86/merge-store-constants.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-store-constants.ll?rev=313458&r1=313457&r2=313458&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-store-constants.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-store-constants.ll Sat Sep 16 06:29:12 2017
@@ -6,18 +6,14 @@ define void @big_nonzero_16_bytes(i32* n
 ; X32-LABEL: big_nonzero_16_bytes:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl $1, (%eax)
-; X32-NEXT:    movl $2, 4(%eax)
-; X32-NEXT:    movl $3, 8(%eax)
-; X32-NEXT:    movl $4, 12(%eax)
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X32-NEXT:    vmovups %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_16_bytes:
 ; X64:       # BB#0:
-; X64-NEXT:    movabsq $8589934593, %rax # imm = 0x200000001
-; X64-NEXT:    movq %rax, (%rdi)
-; X64-NEXT:    movabsq $17179869187, %rax # imm = 0x400000003
-; X64-NEXT:    movq %rax, 8(%rdi)
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X64-NEXT:    vmovups %xmm0, (%rdi)
 ; X64-NEXT:    retq
   %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
@@ -30,29 +26,48 @@ define void @big_nonzero_16_bytes(i32* n
   ret void
 }
 
+; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
+; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
+; it takes extra instructions to do this in scalar.
+
+define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
+; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
+; X32-NEXT:    vmovups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
+; X64:       # BB#0:
+; X64-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    movabsq $12884901889, %rax # imm = 0x300000001
+; X64-NEXT:    movq %rax, 8(%rdi)
+; X64-NEXT:    retq
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1
+
+  store i64 4294967297, i64* %a
+  store i64 12884901889, i64* %arrayidx1
+  ret void
+}
+
 ; Splats may be an opportunity to use a broadcast op.
 
 define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
 ; X32-LABEL: big_nonzero_32_bytes_splat:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl $42, (%eax)
-; X32-NEXT:    movl $42, 4(%eax)
-; X32-NEXT:    movl $42, 8(%eax)
-; X32-NEXT:    movl $42, 12(%eax)
-; X32-NEXT:    movl $42, 16(%eax)
-; X32-NEXT:    movl $42, 20(%eax)
-; X32-NEXT:    movl $42, 24(%eax)
-; X32-NEXT:    movl $42, 28(%eax)
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X32-NEXT:    vmovups %ymm0, (%eax)
+; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_32_bytes_splat:
 ; X64:       # BB#0:
-; X64-NEXT:    movabsq $180388626474, %rax # imm = 0x2A0000002A
-; X64-NEXT:    movq %rax, (%rdi)
-; X64-NEXT:    movq %rax, 8(%rdi)
-; X64-NEXT:    movq %rax, 16(%rdi)
-; X64-NEXT:    movq %rax, 24(%rdi)
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X64-NEXT:    vmovups %ymm0, (%rdi)
+; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
@@ -79,37 +94,29 @@ define void @big_nonzero_63_bytes(i8* no
 ; X32-LABEL: big_nonzero_63_bytes:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl $0, 4(%eax)
-; X32-NEXT:    movl $1, (%eax)
-; X32-NEXT:    movl $0, 12(%eax)
-; X32-NEXT:    movl $2, 8(%eax)
-; X32-NEXT:    movl $0, 20(%eax)
-; X32-NEXT:    movl $3, 16(%eax)
-; X32-NEXT:    movl $0, 28(%eax)
-; X32-NEXT:    movl $4, 24(%eax)
-; X32-NEXT:    movl $0, 36(%eax)
-; X32-NEXT:    movl $5, 32(%eax)
-; X32-NEXT:    movl $0, 44(%eax)
-; X32-NEXT:    movl $6, 40(%eax)
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
+; X32-NEXT:    vmovups %ymm0, (%eax)
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
+; X32-NEXT:    vmovups %xmm0, 32(%eax)
 ; X32-NEXT:    movl $0, 52(%eax)
 ; X32-NEXT:    movl $7, 48(%eax)
 ; X32-NEXT:    movl $8, 56(%eax)
 ; X32-NEXT:    movw $9, 60(%eax)
 ; X32-NEXT:    movb $10, 62(%eax)
+; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_63_bytes:
 ; X64:       # BB#0:
-; X64-NEXT:    movq $1, (%rdi)
-; X64-NEXT:    movq $2, 8(%rdi)
-; X64-NEXT:    movq $3, 16(%rdi)
-; X64-NEXT:    movq $4, 24(%rdi)
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,3,4]
+; X64-NEXT:    vmovups %ymm0, (%rdi)
 ; X64-NEXT:    movq $5, 32(%rdi)
 ; X64-NEXT:    movq $6, 40(%rdi)
 ; X64-NEXT:    movq $7, 48(%rdi)
 ; X64-NEXT:    movl $8, 56(%rdi)
 ; X64-NEXT:    movw $9, 60(%rdi)
 ; X64-NEXT:    movb $10, 62(%rdi)
+; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %a8 = bitcast i8* %a to i64*
   %arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1



