[llvm] r326063 - [X86] Remove VT.isSimple() check from detectAVGPattern.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 25 18:16:32 PST 2018


Author: ctopper
Date: Sun Feb 25 18:16:31 2018
New Revision: 326063

URL: http://llvm.org/viewvc/llvm-project?rev=326063&view=rev
Log:
[X86] Remove VT.isSimple() check from detectAVGPattern.

Which types are considered 'simple' is a function of the requirements of all targets that LLVM supports. That shouldn't directly affect what types we are able to handle. The remainder of this code checks that the number of elements is a power of 2 and takes care of splitting down to a legal size.
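For context, the IR shape that detectAVGPattern recognizes looks like the following (a minimal sketch in the style of the smaller tests in avg.ll; the function name avg_v16i8_sketch is illustrative and not part of this commit). The new avg_v512i8_3 test below exercises the same pattern on <512 x i8>, a type that is not 'simple' and was previously rejected by the removed check:

  define <16 x i8> @avg_v16i8_sketch(<16 x i8> %a, <16 x i8> %b) {
    ; avg(a, b) = trunc((zext(a) + zext(b) + 1) >> 1), widened to i16 so the add cannot overflow
    %za = zext <16 x i8> %a to <16 x i16>
    %zb = zext <16 x i8> %b to <16 x i16>
    %add = add nuw nsw <16 x i16> %za, %zb
    %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %res = trunc <16 x i16> %lshr to <16 x i8>
    ; with SSE2 or later this sequence is expected to lower to a single pavgb
    ret <16 x i8> %res
  }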

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avg.ll
    llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=326063&r1=326062&r2=326063&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Feb 25 18:16:31 2018
@@ -34369,7 +34369,7 @@ static SDValue combineTruncateWithSat(SD
 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 const SDLoc &DL) {
-  if (!VT.isVector() || !VT.isSimple())
+  if (!VT.isVector())
     return SDValue();
   EVT InVT = In.getValueType();
   unsigned NumElems = VT.getVectorNumElements();

Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=326063&r1=326062&r2=326063&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Sun Feb 25 18:16:31 2018
@@ -1651,3 +1651,375 @@ define <64 x i8> @avg_v64i8_3(<64 x i8>
   %res = trunc <64 x i16> %lshr to <64 x i8>
   ret <64 x i8> %res
 }
+
+define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
+; SSE2-LABEL: avg_v512i8_3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 496(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 480(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 464(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 448(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 432(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 416(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 400(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 384(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 368(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 352(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 336(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 320(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 304(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 288(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 272(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 256(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 224(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 208(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 192(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 176(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 160(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 144(%rdi)
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm8, 128(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: avg_v512i8_3:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    movq %rsp, %rbp
+; AVX1-NEXT:    andq $-32, %rsp
+; AVX1-NEXT:    subq $128, %rsp
+; AVX1-NEXT:    vmovdqa 144(%rbp), %ymm8
+; AVX1-NEXT:    vmovdqa 112(%rbp), %ymm9
+; AVX1-NEXT:    vmovdqa 80(%rbp), %ymm10
+; AVX1-NEXT:    vmovdqa 48(%rbp), %ymm11
+; AVX1-NEXT:    vmovdqa 16(%rbp), %ymm12
+; AVX1-NEXT:    vmovdqa 272(%rbp), %ymm13
+; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm14
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm15
+; AVX1-NEXT:    vpavgb %xmm14, %xmm15, %xmm14
+; AVX1-NEXT:    vmovdqa 304(%rbp), %ymm15
+; AVX1-NEXT:    vpavgb %xmm13, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm14
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpavgb %xmm14, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa 336(%rbp), %ymm14
+; AVX1-NEXT:    vpavgb %xmm15, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa 368(%rbp), %ymm1
+; AVX1-NEXT:    vpavgb %xmm14, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovaps %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa 400(%rbp), %ymm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm1
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa 432(%rbp), %ymm1
+; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa 464(%rbp), %ymm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm5
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa 496(%rbp), %ymm1
+; AVX1-NEXT:    vpavgb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm2
+; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa 528(%rbp), %ymm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm1
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa 560(%rbp), %ymm1
+; AVX1-NEXT:    vpavgb %xmm2, %xmm12, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm12
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm2
+; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa 592(%rbp), %ymm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm11, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm11
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm1
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa 624(%rbp), %ymm1
+; AVX1-NEXT:    vpavgb %xmm2, %xmm10, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm10
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm2
+; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa 656(%rbp), %ymm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm9, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa 176(%rbp), %ymm1
+; AVX1-NEXT:    vpavgb %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vmovdqa 688(%rbp), %ymm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm13
+; AVX1-NEXT:    vpavgb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT:    vpavgb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa 208(%rbp), %ymm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm13
+; AVX1-NEXT:    vmovdqa 720(%rbp), %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm15
+; AVX1-NEXT:    vpavgb %xmm1, %xmm15, %xmm1
+; AVX1-NEXT:    vpavgb %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovdqa 240(%rbp), %ymm15
+; AVX1-NEXT:    vmovdqa 752(%rbp), %ymm8
+; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm14
+; AVX1-NEXT:    vpavgb %xmm2, %xmm14, %xmm2
+; AVX1-NEXT:    vpavgb %xmm8, %xmm15, %xmm8
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm8, %ymm2
+; AVX1-NEXT:    vmovaps %ymm2, 480(%rdi)
+; AVX1-NEXT:    vmovaps %ymm1, 448(%rdi)
+; AVX1-NEXT:    vmovaps %ymm13, 416(%rdi)
+; AVX1-NEXT:    vmovaps %ymm0, 384(%rdi)
+; AVX1-NEXT:    vmovaps %ymm9, 352(%rdi)
+; AVX1-NEXT:    vmovaps %ymm10, 320(%rdi)
+; AVX1-NEXT:    vmovaps %ymm11, 288(%rdi)
+; AVX1-NEXT:    vmovaps %ymm12, 256(%rdi)
+; AVX1-NEXT:    vmovaps %ymm7, 224(%rdi)
+; AVX1-NEXT:    vmovaps %ymm6, 192(%rdi)
+; AVX1-NEXT:    vmovaps %ymm5, 160(%rdi)
+; AVX1-NEXT:    vmovaps %ymm4, 128(%rdi)
+; AVX1-NEXT:    vmovaps %ymm3, 96(%rdi)
+; AVX1-NEXT:    vmovaps (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm0, 64(%rdi)
+; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
+; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    movq %rbp, %rsp
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: avg_v512i8_3:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $32, %rsp
+; AVX2-NEXT:    vmovdqa 240(%rbp), %ymm8
+; AVX2-NEXT:    vmovdqa 208(%rbp), %ymm9
+; AVX2-NEXT:    vmovdqa 176(%rbp), %ymm10
+; AVX2-NEXT:    vmovdqa 144(%rbp), %ymm11
+; AVX2-NEXT:    vmovdqa 112(%rbp), %ymm12
+; AVX2-NEXT:    vmovdqa 80(%rbp), %ymm13
+; AVX2-NEXT:    vmovdqa 48(%rbp), %ymm14
+; AVX2-NEXT:    vmovdqa 16(%rbp), %ymm15
+; AVX2-NEXT:    vpavgb 272(%rbp), %ymm0, %ymm0
+; AVX2-NEXT:    vpavgb 304(%rbp), %ymm1, %ymm1
+; AVX2-NEXT:    vpavgb 336(%rbp), %ymm2, %ymm2
+; AVX2-NEXT:    vpavgb 368(%rbp), %ymm3, %ymm3
+; AVX2-NEXT:    vpavgb 400(%rbp), %ymm4, %ymm4
+; AVX2-NEXT:    vpavgb 432(%rbp), %ymm5, %ymm5
+; AVX2-NEXT:    vpavgb 464(%rbp), %ymm6, %ymm6
+; AVX2-NEXT:    vpavgb 496(%rbp), %ymm7, %ymm7
+; AVX2-NEXT:    vpavgb 528(%rbp), %ymm15, %ymm15
+; AVX2-NEXT:    vpavgb 560(%rbp), %ymm14, %ymm14
+; AVX2-NEXT:    vpavgb 592(%rbp), %ymm13, %ymm13
+; AVX2-NEXT:    vpavgb 624(%rbp), %ymm12, %ymm12
+; AVX2-NEXT:    vpavgb 656(%rbp), %ymm11, %ymm11
+; AVX2-NEXT:    vpavgb 688(%rbp), %ymm10, %ymm10
+; AVX2-NEXT:    vpavgb 720(%rbp), %ymm9, %ymm9
+; AVX2-NEXT:    vpavgb 752(%rbp), %ymm8, %ymm8
+; AVX2-NEXT:    vmovdqa %ymm8, 480(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm9, 448(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm10, 416(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm11, 384(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm12, 352(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm13, 320(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm14, 288(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm15, 256(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm7, 224(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm6, 192(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm5, 160(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm4, 128(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm3, 96(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm1, 32(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: avg_v512i8_3:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-32, %rsp
+; AVX512F-NEXT:    subq $32, %rsp
+; AVX512F-NEXT:    vmovdqa 240(%rbp), %ymm8
+; AVX512F-NEXT:    vmovdqa 208(%rbp), %ymm9
+; AVX512F-NEXT:    vmovdqa 176(%rbp), %ymm10
+; AVX512F-NEXT:    vmovdqa 144(%rbp), %ymm11
+; AVX512F-NEXT:    vmovdqa 112(%rbp), %ymm12
+; AVX512F-NEXT:    vmovdqa 80(%rbp), %ymm13
+; AVX512F-NEXT:    vmovdqa 48(%rbp), %ymm14
+; AVX512F-NEXT:    vmovdqa 16(%rbp), %ymm15
+; AVX512F-NEXT:    vpavgb 272(%rbp), %ymm0, %ymm0
+; AVX512F-NEXT:    vpavgb 304(%rbp), %ymm1, %ymm1
+; AVX512F-NEXT:    vpavgb 336(%rbp), %ymm2, %ymm2
+; AVX512F-NEXT:    vpavgb 368(%rbp), %ymm3, %ymm3
+; AVX512F-NEXT:    vpavgb 400(%rbp), %ymm4, %ymm4
+; AVX512F-NEXT:    vpavgb 432(%rbp), %ymm5, %ymm5
+; AVX512F-NEXT:    vpavgb 464(%rbp), %ymm6, %ymm6
+; AVX512F-NEXT:    vpavgb 496(%rbp), %ymm7, %ymm7
+; AVX512F-NEXT:    vpavgb 528(%rbp), %ymm15, %ymm15
+; AVX512F-NEXT:    vpavgb 560(%rbp), %ymm14, %ymm14
+; AVX512F-NEXT:    vpavgb 592(%rbp), %ymm13, %ymm13
+; AVX512F-NEXT:    vpavgb 624(%rbp), %ymm12, %ymm12
+; AVX512F-NEXT:    vpavgb 656(%rbp), %ymm11, %ymm11
+; AVX512F-NEXT:    vpavgb 688(%rbp), %ymm10, %ymm10
+; AVX512F-NEXT:    vpavgb 720(%rbp), %ymm9, %ymm9
+; AVX512F-NEXT:    vpavgb 752(%rbp), %ymm8, %ymm8
+; AVX512F-NEXT:    vmovdqa %ymm8, 480(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm9, 448(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm10, 416(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm11, 384(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm12, 352(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm13, 320(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm14, 288(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm15, 256(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm7, 224(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm6, 192(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm5, 160(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm4, 128(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm3, 96(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm2, 64(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rdi)
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: avg_v512i8_3:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $64, %rsp
+; AVX512BW-NEXT:    vpavgb 16(%rbp), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpavgb 80(%rbp), %zmm1, %zmm1
+; AVX512BW-NEXT:    vpavgb 144(%rbp), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpavgb 208(%rbp), %zmm3, %zmm3
+; AVX512BW-NEXT:    vpavgb 272(%rbp), %zmm4, %zmm4
+; AVX512BW-NEXT:    vpavgb 336(%rbp), %zmm5, %zmm5
+; AVX512BW-NEXT:    vpavgb 400(%rbp), %zmm6, %zmm6
+; AVX512BW-NEXT:    vpavgb 464(%rbp), %zmm7, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %za = zext <512 x i8> %a to <512 x i16>
+  %zb = zext <512 x i8> %b to <512 x i16>
+  %add = add nuw nsw <512 x i16> %za, %zb
+  %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <512 x i16> %lshr to <512 x i8>
+  ret <512 x i8> %res
+}

Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll?rev=326063&r1=326062&r2=326063&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll Sun Feb 25 18:16:31 2018
@@ -641,3 +641,179 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b
   %res = bitcast <8 x i1> %x to i8
   ret i8 %res
 }
+
+define i64 @v16i8_widened_with_zeroes(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-SSSE3-LABEL: v16i8_widened_with_zeroes:
+; SSE2-SSSE3:       # %bb.0: # %entry
+; SSE2-SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    leal (%rcx,%rax,2), %eax
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    leal (%rax,%rcx,4), %eax
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    leal (%rax,%rcx,8), %eax
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    shll $4, %ecx
+; SSE2-SSSE3-NEXT:    orl %eax, %ecx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    shll $5, %eax
+; SSE2-SSSE3-NEXT:    orl %ecx, %eax
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    shll $6, %ecx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    andl $1, %edx
+; SSE2-SSSE3-NEXT:    shll $7, %edx
+; SSE2-SSSE3-NEXT:    orl %ecx, %edx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    shll $8, %ecx
+; SSE2-SSSE3-NEXT:    orl %edx, %ecx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    andl $1, %edx
+; SSE2-SSSE3-NEXT:    shll $9, %edx
+; SSE2-SSSE3-NEXT:    orl %ecx, %edx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    shll $10, %ecx
+; SSE2-SSSE3-NEXT:    orl %edx, %ecx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    andl $1, %edx
+; SSE2-SSSE3-NEXT:    shll $11, %edx
+; SSE2-SSSE3-NEXT:    orl %ecx, %edx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    shll $12, %ecx
+; SSE2-SSSE3-NEXT:    orl %edx, %ecx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    andl $1, %edx
+; SSE2-SSSE3-NEXT:    shll $13, %edx
+; SSE2-SSSE3-NEXT:    orl %ecx, %edx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    andl $1, %ecx
+; SSE2-SSSE3-NEXT:    shll $14, %ecx
+; SSE2-SSSE3-NEXT:    orl %edx, %ecx
+; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    shll $15, %edx
+; SSE2-SSSE3-NEXT:    orl %ecx, %edx
+; SSE2-SSSE3-NEXT:    orl %eax, %edx
+; SSE2-SSSE3-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movzwl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    movl %edx, %eax
+; SSE2-SSSE3-NEXT:    shll $16, %eax
+; SSE2-SSSE3-NEXT:    orl %eax, %edx
+; SSE2-SSSE3-NEXT:    shlq $32, %rdx
+; SSE2-SSSE3-NEXT:    orl %ecx, %eax
+; SSE2-SSSE3-NEXT:    orq %rdx, %rax
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v16i8_widened_with_zeroes:
+; AVX12:       # %bb.0: # %entry
+; AVX12-NEXT:    pushq %rbp
+; AVX12-NEXT:    .cfi_def_cfa_offset 16
+; AVX12-NEXT:    .cfi_offset %rbp, -16
+; AVX12-NEXT:    movq %rsp, %rbp
+; AVX12-NEXT:    .cfi_def_cfa_register %rbp
+; AVX12-NEXT:    andq $-32, %rsp
+; AVX12-NEXT:    subq $64, %rsp
+; AVX12-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    vpextrb $0, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    leal (%rcx,%rax,2), %eax
+; AVX12-NEXT:    vpextrb $2, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    leal (%rax,%rcx,4), %eax
+; AVX12-NEXT:    vpextrb $3, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    leal (%rax,%rcx,8), %eax
+; AVX12-NEXT:    vpextrb $4, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    shll $4, %ecx
+; AVX12-NEXT:    orl %eax, %ecx
+; AVX12-NEXT:    vpextrb $5, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    shll $5, %eax
+; AVX12-NEXT:    orl %ecx, %eax
+; AVX12-NEXT:    vpextrb $6, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    shll $6, %ecx
+; AVX12-NEXT:    vpextrb $7, %xmm0, %edx
+; AVX12-NEXT:    andl $1, %edx
+; AVX12-NEXT:    shll $7, %edx
+; AVX12-NEXT:    orl %ecx, %edx
+; AVX12-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    shll $8, %ecx
+; AVX12-NEXT:    orl %edx, %ecx
+; AVX12-NEXT:    vpextrb $9, %xmm0, %edx
+; AVX12-NEXT:    andl $1, %edx
+; AVX12-NEXT:    shll $9, %edx
+; AVX12-NEXT:    orl %ecx, %edx
+; AVX12-NEXT:    vpextrb $10, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    shll $10, %ecx
+; AVX12-NEXT:    orl %edx, %ecx
+; AVX12-NEXT:    vpextrb $11, %xmm0, %edx
+; AVX12-NEXT:    andl $1, %edx
+; AVX12-NEXT:    shll $11, %edx
+; AVX12-NEXT:    orl %ecx, %edx
+; AVX12-NEXT:    vpextrb $12, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    shll $12, %ecx
+; AVX12-NEXT:    orl %edx, %ecx
+; AVX12-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX12-NEXT:    andl $1, %edx
+; AVX12-NEXT:    shll $13, %edx
+; AVX12-NEXT:    orl %ecx, %edx
+; AVX12-NEXT:    vpextrb $14, %xmm0, %ecx
+; AVX12-NEXT:    andl $1, %ecx
+; AVX12-NEXT:    shll $14, %ecx
+; AVX12-NEXT:    orl %edx, %ecx
+; AVX12-NEXT:    vpextrb $15, %xmm0, %edx
+; AVX12-NEXT:    andl $1, %edx
+; AVX12-NEXT:    shll $15, %edx
+; AVX12-NEXT:    orl %ecx, %edx
+; AVX12-NEXT:    orl %eax, %edx
+; AVX12-NEXT:    movl %edx, (%rsp)
+; AVX12-NEXT:    movl $0, {{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; AVX12-NEXT:    shlq $32, %rcx
+; AVX12-NEXT:    movl (%rsp), %eax
+; AVX12-NEXT:    orq %rcx, %rax
+; AVX12-NEXT:    movq %rbp, %rsp
+; AVX12-NEXT:    popq %rbp
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: v16i8_widened_with_zeroes:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movzwl %ax, %eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v16i8_widened_with_zeroes:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    retq
+entry:
+  %c = icmp eq <16 x i8> %a, %b
+  %d = shufflevector <16 x i1> %c, <16 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %e = bitcast <64 x i1> %d to i64
+  ret i64 %e
+}
