[llvm] r326063 - [X86] Remove VT.isSimple() check from detectAVGPattern.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 25 18:16:32 PST 2018
Author: ctopper
Date: Sun Feb 25 18:16:31 2018
New Revision: 326063
URL: http://llvm.org/viewvc/llvm-project?rev=326063&view=rev
Log:
[X86] Remove VT.isSimple() check from detectAVGPattern.
Which types are considered 'simple' is a function of the requirements of all the targets that LLVM supports. That shouldn't directly affect which types we are able to handle here. The remainder of this code already checks that the number of elements is a power of 2 and takes care of splitting down to a legal size.
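
The property the remaining code depends on can be illustrated with a small standalone sketch. This is not LLVM code; the helper names and the 64-lane "legal" width (mirroring 64 x i8 in a 512-bit register) are illustrative assumptions. The point is that a power-of-two element count, such as the <512 x i8> in the new test below that the old isSimple() bail-out would have rejected, can always be halved down to a width the target handles directly.

  // Standalone sketch (not LLVM code) of why a power-of-two lane count is
  // enough: it can be split in half repeatedly until it reaches a legal
  // vector width. The 64-lane cap is an assumption mirroring 64 x i8.
  #include <cassert>
  #include <cstdio>

  static bool isPowerOf2(unsigned N) { return N != 0 && (N & (N - 1)) == 0; }

  // Count how many halvings a power-of-two lane count needs to reach the
  // assumed legal lane count.
  static unsigned splitsToLegal(unsigned NumElems, unsigned LegalElems) {
    assert(isPowerOf2(NumElems) && isPowerOf2(LegalElems));
    unsigned Splits = 0;
    while (NumElems > LegalElems) {
      NumElems /= 2;
      ++Splits;
    }
    return Splits;
  }

  int main() {
    // 512 lanes, as in avg_v512i8_3 below: three halvings reach 64 lanes.
    std::printf("512 -> 64 lanes after %u halvings\n", splitsToLegal(512, 64));
    return 0;
  }
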
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avg.ll
llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=326063&r1=326062&r2=326063&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Feb 25 18:16:31 2018
@@ -34369,7 +34369,7 @@ static SDValue combineTruncateWithSat(SD
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
- if (!VT.isVector() || !VT.isSimple())
+ if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=326063&r1=326062&r2=326063&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Sun Feb 25 18:16:31 2018
@@ -1651,3 +1651,375 @@ define <64 x i8> @avg_v64i8_3(<64 x i8>
%res = trunc <64 x i16> %lshr to <64 x i8>
ret <64 x i8> %res
}
+
+define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
+; SSE2-LABEL: avg_v512i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 128(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movdqa %xmm5, 80(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: avg_v512i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: vmovdqa 144(%rbp), %ymm8
+; AVX1-NEXT: vmovdqa 112(%rbp), %ymm9
+; AVX1-NEXT: vmovdqa 80(%rbp), %ymm10
+; AVX1-NEXT: vmovdqa 48(%rbp), %ymm11
+; AVX1-NEXT: vmovdqa 16(%rbp), %ymm12
+; AVX1-NEXT: vmovdqa 272(%rbp), %ymm13
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm14
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm15
+; AVX1-NEXT: vpavgb %xmm14, %xmm15, %xmm14
+; AVX1-NEXT: vmovdqa 304(%rbp), %ymm15
+; AVX1-NEXT: vpavgb %xmm13, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpavgb %xmm14, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa 336(%rbp), %ymm14
+; AVX1-NEXT: vpavgb %xmm15, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 368(%rbp), %ymm1
+; AVX1-NEXT: vpavgb %xmm14, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa 400(%rbp), %ymm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 432(%rbp), %ymm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa 464(%rbp), %ymm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 496(%rbp), %ymm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa 528(%rbp), %ymm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 560(%rbp), %ymm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm12, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm12
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa 592(%rbp), %ymm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm11, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 624(%rbp), %ymm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm10, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm10
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa 656(%rbp), %ymm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 176(%rbp), %ymm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vmovdqa 688(%rbp), %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13
+; AVX1-NEXT: vpavgb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT: vpavgb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa 208(%rbp), %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm13
+; AVX1-NEXT: vmovdqa 720(%rbp), %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm15
+; AVX1-NEXT: vpavgb %xmm1, %xmm15, %xmm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa 240(%rbp), %ymm15
+; AVX1-NEXT: vmovdqa 752(%rbp), %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14
+; AVX1-NEXT: vpavgb %xmm2, %xmm14, %xmm2
+; AVX1-NEXT: vpavgb %xmm8, %xmm15, %xmm8
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2
+; AVX1-NEXT: vmovaps %ymm2, 480(%rdi)
+; AVX1-NEXT: vmovaps %ymm1, 448(%rdi)
+; AVX1-NEXT: vmovaps %ymm13, 416(%rdi)
+; AVX1-NEXT: vmovaps %ymm0, 384(%rdi)
+; AVX1-NEXT: vmovaps %ymm9, 352(%rdi)
+; AVX1-NEXT: vmovaps %ymm10, 320(%rdi)
+; AVX1-NEXT: vmovaps %ymm11, 288(%rdi)
+; AVX1-NEXT: vmovaps %ymm12, 256(%rdi)
+; AVX1-NEXT: vmovaps %ymm7, 224(%rdi)
+; AVX1-NEXT: vmovaps %ymm6, 192(%rdi)
+; AVX1-NEXT: vmovaps %ymm5, 160(%rdi)
+; AVX1-NEXT: vmovaps %ymm4, 128(%rdi)
+; AVX1-NEXT: vmovaps %ymm3, 96(%rdi)
+; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vmovaps %ymm0, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v512i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8
+; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9
+; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10
+; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11
+; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12
+; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13
+; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14
+; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15
+; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
+; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
+; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
+; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
+; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
+; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
+; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
+; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
+; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
+; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
+; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
+; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
+; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
+; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
+; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
+; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
+; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi)
+; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi)
+; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi)
+; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi)
+; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi)
+; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi)
+; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi)
+; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi)
+; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi)
+; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi)
+; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi)
+; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi)
+; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: avg_v512i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
+; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
+; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
+; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
+; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
+; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
+; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
+; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
+; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
+; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
+; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
+; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
+; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
+; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
+; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
+; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
+; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
+; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
+; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
+; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
+; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
+; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
+; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v512i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $64, %rsp
+; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0
+; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1
+; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2
+; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3
+; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4
+; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5
+; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6
+; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7
+; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT: movq %rdi, %rax
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %za = zext <512 x i8> %a to <512 x i16>
+ %zb = zext <512 x i8> %b to <512 x i16>
+ %add = add nuw nsw <512 x i16> %za, %zb
+ %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %res = trunc <512 x i16> %lshr to <512 x i8>
+ ret <512 x i8> %res
+}
Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll?rev=326063&r1=326062&r2=326063&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll Sun Feb 25 18:16:31 2018
@@ -641,3 +641,179 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b
%res = bitcast <8 x i1> %x to i8
ret i8 %res
}
+
+define i64 @v16i8_widened_with_zeroes(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-SSSE3-LABEL: v16i8_widened_with_zeroes:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: leal (%rcx,%rax,2), %eax
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: leal (%rax,%rcx,4), %eax
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: leal (%rax,%rcx,8), %eax
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: shll $4, %ecx
+; SSE2-SSSE3-NEXT: orl %eax, %ecx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: shll $5, %eax
+; SSE2-SSSE3-NEXT: orl %ecx, %eax
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: shll $6, %ecx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT: andl $1, %edx
+; SSE2-SSSE3-NEXT: shll $7, %edx
+; SSE2-SSSE3-NEXT: orl %ecx, %edx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: shll $8, %ecx
+; SSE2-SSSE3-NEXT: orl %edx, %ecx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT: andl $1, %edx
+; SSE2-SSSE3-NEXT: shll $9, %edx
+; SSE2-SSSE3-NEXT: orl %ecx, %edx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: shll $10, %ecx
+; SSE2-SSSE3-NEXT: orl %edx, %ecx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT: andl $1, %edx
+; SSE2-SSSE3-NEXT: shll $11, %edx
+; SSE2-SSSE3-NEXT: orl %ecx, %edx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: shll $12, %ecx
+; SSE2-SSSE3-NEXT: orl %edx, %ecx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT: andl $1, %edx
+; SSE2-SSSE3-NEXT: shll $13, %edx
+; SSE2-SSSE3-NEXT: orl %ecx, %edx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: shll $14, %ecx
+; SSE2-SSSE3-NEXT: orl %edx, %ecx
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT: shll $15, %edx
+; SSE2-SSSE3-NEXT: orl %ecx, %edx
+; SSE2-SSSE3-NEXT: orl %eax, %edx
+; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: movw $0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT: movl %edx, %eax
+; SSE2-SSSE3-NEXT: shll $16, %eax
+; SSE2-SSSE3-NEXT: orl %eax, %edx
+; SSE2-SSSE3-NEXT: shlq $32, %rdx
+; SSE2-SSSE3-NEXT: orl %ecx, %eax
+; SSE2-SSSE3-NEXT: orq %rdx, %rax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v16i8_widened_with_zeroes:
+; AVX12: # %bb.0: # %entry
+; AVX12-NEXT: pushq %rbp
+; AVX12-NEXT: .cfi_def_cfa_offset 16
+; AVX12-NEXT: .cfi_offset %rbp, -16
+; AVX12-NEXT: movq %rsp, %rbp
+; AVX12-NEXT: .cfi_def_cfa_register %rbp
+; AVX12-NEXT: andq $-32, %rsp
+; AVX12-NEXT: subq $64, %rsp
+; AVX12-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpextrb $1, %xmm0, %eax
+; AVX12-NEXT: andl $1, %eax
+; AVX12-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: leal (%rcx,%rax,2), %eax
+; AVX12-NEXT: vpextrb $2, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: leal (%rax,%rcx,4), %eax
+; AVX12-NEXT: vpextrb $3, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: leal (%rax,%rcx,8), %eax
+; AVX12-NEXT: vpextrb $4, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: shll $4, %ecx
+; AVX12-NEXT: orl %eax, %ecx
+; AVX12-NEXT: vpextrb $5, %xmm0, %eax
+; AVX12-NEXT: andl $1, %eax
+; AVX12-NEXT: shll $5, %eax
+; AVX12-NEXT: orl %ecx, %eax
+; AVX12-NEXT: vpextrb $6, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: shll $6, %ecx
+; AVX12-NEXT: vpextrb $7, %xmm0, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: shll $7, %edx
+; AVX12-NEXT: orl %ecx, %edx
+; AVX12-NEXT: vpextrb $8, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: shll $8, %ecx
+; AVX12-NEXT: orl %edx, %ecx
+; AVX12-NEXT: vpextrb $9, %xmm0, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: shll $9, %edx
+; AVX12-NEXT: orl %ecx, %edx
+; AVX12-NEXT: vpextrb $10, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: shll $10, %ecx
+; AVX12-NEXT: orl %edx, %ecx
+; AVX12-NEXT: vpextrb $11, %xmm0, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: shll $11, %edx
+; AVX12-NEXT: orl %ecx, %edx
+; AVX12-NEXT: vpextrb $12, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: shll $12, %ecx
+; AVX12-NEXT: orl %edx, %ecx
+; AVX12-NEXT: vpextrb $13, %xmm0, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: shll $13, %edx
+; AVX12-NEXT: orl %ecx, %edx
+; AVX12-NEXT: vpextrb $14, %xmm0, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: shll $14, %ecx
+; AVX12-NEXT: orl %edx, %ecx
+; AVX12-NEXT: vpextrb $15, %xmm0, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: shll $15, %edx
+; AVX12-NEXT: orl %ecx, %edx
+; AVX12-NEXT: orl %eax, %edx
+; AVX12-NEXT: movl %edx, (%rsp)
+; AVX12-NEXT: movl $0, {{[0-9]+}}(%rsp)
+; AVX12-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; AVX12-NEXT: shlq $32, %rcx
+; AVX12-NEXT: movl (%rsp), %eax
+; AVX12-NEXT: orq %rcx, %rax
+; AVX12-NEXT: movq %rbp, %rsp
+; AVX12-NEXT: popq %rbp
+; AVX12-NEXT: retq
+;
+; AVX512F-LABEL: v16i8_widened_with_zeroes:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i8_widened_with_zeroes:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+entry:
+ %c = icmp eq <16 x i8> %a, %b
+ %d = shufflevector <16 x i1> %c, <16 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %e = bitcast <64 x i1> %d to i64
+ ret i64 %e
+}