[llvm] r372540 - Prefer AVX512 memcpy when applicable

David Zarzycki via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 22 22:00:59 PDT 2019


Author: davezarzycki
Date: Sun Sep 22 22:00:59 2019
New Revision: 372540

URL: http://llvm.org/viewvc/llvm-project?rev=372540&view=rev
Log:
Prefer AVX512 memcpy when applicable

When AVX512 is available and the preferred vector width is 512 bits or
wider, we should prefer AVX512 for memcpy().

https://bugs.llvm.org/show_bug.cgi?id=43240

https://reviews.llvm.org/D67874

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/memcpy.ll
    llvm/trunk/test/CodeGen/X86/memset-nonzero.ll
    llvm/trunk/test/CodeGen/X86/memset-zero.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=372540&r1=372539&r2=372540&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Sep 22 22:00:59 2019
@@ -2149,6 +2149,11 @@ EVT X86TargetLowering::getOptimalMemOpTy
     if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
                        ((DstAlign == 0 || DstAlign >= 16) &&
                         (SrcAlign == 0 || SrcAlign >= 16)))) {
+      // FIXME: Check if unaligned 64-byte accesses are slow.
+      if (Size >= 64 && Subtarget.hasAVX512() &&
+          (Subtarget.getPreferVectorWidth() >= 512)) {
+        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
+      }
       // FIXME: Check if unaligned 32-byte accesses are slow.
       if (Size >= 32 && Subtarget.hasAVX() &&
           (Subtarget.getPreferVectorWidth() >= 256)) {

Modified: llvm/trunk/test/CodeGen/X86/memcpy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/memcpy.ll?rev=372540&r1=372539&r2=372540&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/memcpy.ll (original)
+++ llvm/trunk/test/CodeGen/X86/memcpy.ll Sun Sep 22 22:00:59 2019
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin      -mcpu=core2   | FileCheck %s -check-prefix=DARWIN
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2   | FileCheck %s -check-prefix=LINUX
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s -check-prefix=LINUX-SKL
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx     | FileCheck %s -check-prefix=LINUX-SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl     | FileCheck %s -check-prefix=LINUX-KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin      -mcpu=core2     | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2     | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake   | FileCheck %s -check-prefix=LINUX-SKL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx       | FileCheck %s -check-prefix=LINUX-SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl       | FileCheck %s -check-prefix=LINUX-KNL
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512bw | FileCheck %s -check-prefix=LINUX-AVX512BW
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
@@ -124,10 +124,8 @@ define void @test3(i8* nocapture %A, i8*
 ;
 ; LINUX-KNL-LABEL: test3:
 ; LINUX-KNL:       # %bb.0: # %entry
-; LINUX-KNL-NEXT:    vmovups (%rsi), %ymm0
-; LINUX-KNL-NEXT:    vmovups 32(%rsi), %ymm1
-; LINUX-KNL-NEXT:    vmovups %ymm1, 32(%rdi)
-; LINUX-KNL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
 ; LINUX-KNL-NEXT:    retq
 ;
 ; LINUX-AVX512BW-LABEL: test3:
@@ -174,10 +172,8 @@ define void @test3_minsize(i8* nocapture
 ;
 ; LINUX-KNL-LABEL: test3_minsize:
 ; LINUX-KNL:       # %bb.0:
-; LINUX-KNL-NEXT:    vmovups (%rsi), %ymm0
-; LINUX-KNL-NEXT:    vmovups 32(%rsi), %ymm1
-; LINUX-KNL-NEXT:    vmovups %ymm1, 32(%rdi)
-; LINUX-KNL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
 ; LINUX-KNL-NEXT:    retq
 ;
 ; LINUX-AVX512BW-LABEL: test3_minsize:
@@ -223,10 +219,8 @@ define void @test3_minsize_optsize(i8* n
 ;
 ; LINUX-KNL-LABEL: test3_minsize_optsize:
 ; LINUX-KNL:       # %bb.0:
-; LINUX-KNL-NEXT:    vmovups (%rsi), %ymm0
-; LINUX-KNL-NEXT:    vmovups 32(%rsi), %ymm1
-; LINUX-KNL-NEXT:    vmovups %ymm1, 32(%rdi)
-; LINUX-KNL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
 ; LINUX-KNL-NEXT:    retq
 ;
 ; LINUX-AVX512BW-LABEL: test3_minsize_optsize:
@@ -301,10 +295,8 @@ define void @test4(i8* nocapture %A, i8*
 ;
 ; LINUX-KNL-LABEL: test4:
 ; LINUX-KNL:       # %bb.0: # %entry
-; LINUX-KNL-NEXT:    vmovups (%rsi), %ymm0
-; LINUX-KNL-NEXT:    vmovups 32(%rsi), %ymm1
-; LINUX-KNL-NEXT:    vmovups %ymm1, 32(%rdi)
-; LINUX-KNL-NEXT:    vmovups %ymm0, (%rdi)
+; LINUX-KNL-NEXT:    vmovups (%rsi), %zmm0
+; LINUX-KNL-NEXT:    vmovups %zmm0, (%rdi)
 ; LINUX-KNL-NEXT:    retq
 ;
 ; LINUX-AVX512BW-LABEL: test4:

Modified: llvm/trunk/test/CodeGen/X86/memset-nonzero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/memset-nonzero.ll?rev=372540&r1=372539&r2=372540&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/memset-nonzero.ll (original)
+++ llvm/trunk/test/CodeGen/X86/memset-nonzero.ll Sun Sep 22 22:00:59 2019
@@ -1,11 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse  | FileCheck %s --check-prefix=SSE
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx  | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512-ymm
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
 
 ; https://llvm.org/bugs/show_bug.cgi?id=27100
 
@@ -82,13 +85,44 @@ define void @memset_64_nonzero_bytes(i8*
 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
 ; SSE2FAST-NEXT:    retq
 ;
-; AVX-LABEL: memset_64_nonzero_bytes:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: memset_64_nonzero_bytes:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: memset_64_nonzero_bytes:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-ymm-LABEL: memset_64_nonzero_bytes:
+; AVX512-ymm:       # %bb.0:
+; AVX512-ymm-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512-ymm-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512-ymm-NEXT:    vzeroupper
+; AVX512-ymm-NEXT:    retq
+;
+; AVX512F-LABEL: memset_64_nonzero_bytes:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
+; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: memset_64_nonzero_bytes:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
   ret void
 }
@@ -128,15 +162,51 @@ define void @memset_128_nonzero_bytes(i8
 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
 ; SSE2FAST-NEXT:    retq
 ;
-; AVX-LABEL: memset_128_nonzero_bytes:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
-; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: memset_128_nonzero_bytes:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: memset_128_nonzero_bytes:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-ymm-LABEL: memset_128_nonzero_bytes:
+; AVX512-ymm:       # %bb.0:
+; AVX512-ymm-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512-ymm-NEXT:    vmovups %ymm0, 96(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512-ymm-NEXT:    vzeroupper
+; AVX512-ymm-NEXT:    retq
+;
+; AVX512F-LABEL: memset_128_nonzero_bytes:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
+; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
+; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: memset_128_nonzero_bytes:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
+; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
   ret void
 }
@@ -174,19 +244,67 @@ define void @memset_256_nonzero_bytes(i8
 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
 ; SSE2FAST-NEXT:    retq
 ;
-; AVX-LABEL: memset_256_nonzero_bytes:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
-; AVX-NEXT:    vmovups %ymm0, 224(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 192(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 160(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 128(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: memset_256_nonzero_bytes:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT:    vmovups %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: memset_256_nonzero_bytes:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT:    vmovups %ymm0, 224(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 192(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 160(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 128(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT:    vmovups %ymm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-ymm-LABEL: memset_256_nonzero_bytes:
+; AVX512-ymm:       # %bb.0:
+; AVX512-ymm-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512-ymm-NEXT:    vmovups %ymm0, 224(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 192(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 160(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 128(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 96(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX512-ymm-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512-ymm-NEXT:    vzeroupper
+; AVX512-ymm-NEXT:    retq
+;
+; AVX512F-LABEL: memset_256_nonzero_bytes:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
+; AVX512F-NEXT:    vmovups %zmm0, 192(%rdi)
+; AVX512F-NEXT:    vmovups %zmm0, 128(%rdi)
+; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
+; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: memset_256_nonzero_bytes:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT:    vmovups %zmm0, 192(%rdi)
+; AVX512BW-NEXT:    vmovups %zmm0, 128(%rdi)
+; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
+; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
   ret void
 }
@@ -340,14 +458,30 @@ define void @memset_64_nonconst_bytes(i8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: memset_64_nonconst_bytes:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovd %esi, %xmm0
-; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, 32(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512-ymm-LABEL: memset_64_nonconst_bytes:
+; AVX512-ymm:       # %bb.0:
+; AVX512-ymm-NEXT:    vmovd %esi, %xmm0
+; AVX512-ymm-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 32(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512-ymm-NEXT:    vzeroupper
+; AVX512-ymm-NEXT:    retq
+;
+; AVX512F-LABEL: memset_64_nonconst_bytes:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movzbl %sil, %eax
+; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: memset_64_nonconst_bytes:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false)
   ret void
 }
@@ -417,16 +551,34 @@ define void @memset_128_nonconst_bytes(i
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: memset_128_nonconst_bytes:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovd %esi, %xmm0
-; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, 96(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 64(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 32(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512-ymm-LABEL: memset_128_nonconst_bytes:
+; AVX512-ymm:       # %bb.0:
+; AVX512-ymm-NEXT:    vmovd %esi, %xmm0
+; AVX512-ymm-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 96(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 64(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 32(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512-ymm-NEXT:    vzeroupper
+; AVX512-ymm-NEXT:    retq
+;
+; AVX512F-LABEL: memset_128_nonconst_bytes:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movzbl %sil, %eax
+; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: memset_128_nonconst_bytes:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false)
   ret void
 }
@@ -493,20 +645,42 @@ define void @memset_256_nonconst_bytes(i
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: memset_256_nonconst_bytes:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovd %esi, %xmm0
-; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, 224(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 192(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 160(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 128(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 96(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 64(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, 32(%rdi)
-; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512-ymm-LABEL: memset_256_nonconst_bytes:
+; AVX512-ymm:       # %bb.0:
+; AVX512-ymm-NEXT:    vmovd %esi, %xmm0
+; AVX512-ymm-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 224(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 192(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 160(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 128(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 96(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 64(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, 32(%rdi)
+; AVX512-ymm-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512-ymm-NEXT:    vzeroupper
+; AVX512-ymm-NEXT:    retq
+;
+; AVX512F-LABEL: memset_256_nonconst_bytes:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movzbl %sil, %eax
+; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: memset_256_nonconst_bytes:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false)
   ret void
 }

Modified: llvm/trunk/test/CodeGen/X86/memset-zero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/memset-zero.ll?rev=372540&r1=372539&r2=372540&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/memset-zero.ll (original)
+++ llvm/trunk/test/CodeGen/X86/memset-zero.ll Sun Sep 22 22:00:59 2019
@@ -752,8 +752,7 @@ define void @memset_64(i8* %a) nounwind
 ; KNL-LABEL: memset_64:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; KNL-NEXT:    vmovups %ymm0, 32(%rdi)
-; KNL-NEXT:    vmovups %ymm0, (%rdi)
+; KNL-NEXT:    vmovups %zmm0, (%rdi)
 ; KNL-NEXT:    retq
 entry:
 	call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false)
@@ -819,8 +818,7 @@ define void @memset_64_align64(i8* %a) n
 ; KNL-LABEL: memset_64_align64:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; KNL-NEXT:    vmovaps %ymm0, 32(%rdi)
-; KNL-NEXT:    vmovaps %ymm0, (%rdi)
+; KNL-NEXT:    vmovaps %zmm0, (%rdi)
 ; KNL-NEXT:    retq
 entry:
 	call void @llvm.memset.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 false)




More information about the llvm-commits mailing list