[llvm] r368183 - Recommit r367901 "[X86] Enable -x86-experimental-vector-widening-legalization by default."
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 7 09:24:27 PDT 2019
Modified: llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128.ll Wed Aug 7 09:24:26 2019
@@ -20,157 +20,6 @@
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: xorpd %xmm0, %xmm2
-; SSE2-NEXT: psubq %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm0, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm0, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: psubq %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: xorps %xmm5, %xmm5
-; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm5, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm3[1]
-; X32-SSE-NEXT: xorpd %xmm0, %xmm2
-; X32-SSE-NEXT: psubq %xmm0, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad %xmm2, %xmm3
@@ -189,100 +38,70 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i16:
+; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrad %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm1, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i16:
+; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i16:
+; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i16:
+; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrad %xmm2, %xmm3
@@ -300,167 +119,21 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = ashr <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm2, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: psubq %xmm3, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOP-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i16> %a, %b
- ret <2 x i16> %shift
+ %shift = ashr <2 x i32> %a, %b
+ ret <2 x i32> %shift
}
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllw $8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
@@ -486,66 +159,57 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v8i8:
+; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: paddw %xmm2, %xmm4
-; SSE41-NEXT: psraw $15, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v8i8:
+; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v8i8:
+; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
@@ -554,21 +218,15 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOP-LABEL: var_shift_v8i8:
+; XOP-LABEL: var_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $8, %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ-LABEL: var_shift_v4i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
@@ -577,21 +235,17 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW-LABEL: var_shift_v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL-LABEL: var_shift_v4i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
@@ -599,27 +253,21 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
-; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL-LABEL: var_shift_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllw $8, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psraw $8, %xmm3
; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: pand %xmm0, %xmm2
-; X32-SSE-NEXT: pandn %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
@@ -644,138 +292,634 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = ashr <8 x i8> %a, %b
- ret <8 x i8> %shift
+ %shift = ashr <4 x i16> %a, %b
+ ret <4 x i16> %shift
}
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad %xmm4, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm1, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i8:
+; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i8:
+; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: var_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $12, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = ashr <2 x i16> %a, %b
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $2, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psraw $1, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = ashr <8 x i8> %a, %b
+ ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad %xmm4, %xmm2
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm1, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $2, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psraw $1, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, %b
ret <4 x i8> %shift
@@ -784,136 +928,226 @@ define <4 x i8> @var_shift_v4i8(<4 x i8>
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm4, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm4, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubq %xmm2, %xmm0
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: var_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $2, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psraw $1, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, %b
ret <2 x i8> %shift
@@ -926,154 +1160,46 @@ define <2 x i8> @var_shift_v2i8(<2 x i8>
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: xorpd %xmm1, %xmm2
-; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm4
-; SSE41-NEXT: psrlq %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: psubq %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: psrad %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: psrlq %xmm0, %xmm4
-; X32-SSE-NEXT: xorps %xmm5, %xmm5
-; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm5, %xmm3
-; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: psrlq %xmm5, %xmm1
-; X32-SSE-NEXT: psrlq %xmm0, %xmm2
-; X32-SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
-; X32-SSE-NEXT: xorpd %xmm3, %xmm2
-; X32-SSE-NEXT: psubq %xmm3, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
+; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT: psrad %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i32> %a, %splat
@@ -1083,146 +1209,46 @@ define <2 x i32> @splatvar_shift_v2i32(<
define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm1, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrad %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psraw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm1, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psraw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
%shift = ashr <4 x i16> %a, %splat
@@ -1232,366 +1258,188 @@ define <4 x i16> @splatvar_shift_v4i16(<
define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm2, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: psubq %xmm3, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psraw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psraw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i16> %a, %splat
ret <2 x i16> %shift
}
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllw $8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: paddw %xmm2, %xmm4
-; SSE41-NEXT: psraw $15, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOP-LABEL: splatvar_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $8, %xmm0, %xmm0
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllw $8, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psraw $8, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: pand %xmm0, %xmm2
-; X32-SSE-NEXT: pandn %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm0
+; X32-SSE-NEXT: psubb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
%shift = ashr <8 x i8> %a, %splat
@@ -1601,133 +1449,139 @@ define <8 x i8> @splatvar_shift_v8i8(<8
define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm1, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm1, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm0
+; X32-SSE-NEXT: psubb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
%shift = ashr <4 x i8> %a, %splat
@@ -1737,138 +1591,130 @@ define <4 x i8> @splatvar_shift_v4i8(<4
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm4, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm4, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubq %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm0
+; X32-SSE-NEXT: psubb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i8> %a, %splat
@@ -1882,100 +1728,62 @@ define <2 x i8> @splatvar_shift_v2i8(<2
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $4, %xmm1
+; SSE2-NEXT: psrad $5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $4, %xmm0
-; SSE2-NEXT: psrlq $5, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm0 = [576460752303423488,288230376151711744]
-; SSE2-NEXT: xorpd %xmm0, %xmm1
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $5, %xmm0
-; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [576460752303423488,288230376151711744]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: psubq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrad $5, %xmm1
+; SSE41-NEXT: psrad $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOP-LABEL: constant_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrad $4, %xmm1
+; X32-SSE-NEXT: psrad $5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psrlq $4, %xmm0
-; X32-SSE-NEXT: psrlq $5, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: movapd {{.*#+}} xmm0 = [3.7857669957336791E-270,2.0522684006491881E-289]
-; X32-SSE-NEXT: xorpd %xmm0, %xmm1
-; X32-SSE-NEXT: psubq %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
@@ -1984,96 +1792,83 @@ define <2 x i32> @constant_shift_v2i32(<
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $19, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $18, %xmm3
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrad $17, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $19, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad $17, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrad $18, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
+; SSE41-NEXT: pmulhw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $19, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $17, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $18, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $16, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $19, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad $18, %xmm3
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; X32-SSE-NEXT: psrad $17, %xmm0
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; X32-SSE-NEXT: psraw $2, %xmm1
+; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: andps %xmm2, %xmm0
+; X32-SSE-NEXT: psraw $1, %xmm1
+; X32-SSE-NEXT: andnps %xmm1, %xmm2
+; X32-SSE-NEXT: orps %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift
@@ -2082,425 +1877,336 @@ define <4 x i16> @constant_shift_v4i16(<
define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE2-NEXT: xorpd %xmm1, %xmm0
-; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: psraw $3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: psubq %xmm1, %xmm0
+; SSE41-NEXT: psraw $3, %xmm1
+; SSE41-NEXT: psraw $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: constant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [1.4916681462400413E-154,1.2882297539194267E-231]
-; X32-SSE-NEXT: xorpd %xmm1, %xmm0
-; X32-SSE-NEXT: psubq %xmm1, %xmm0
+; X32-SSE-NEXT: psraw $3, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i16> %a, <i16 2, i16 3>
ret <2 x i16> %shift
}
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: psraw $12, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
-; SSE2-NEXT: retq
+; SSE-LABEL: constant_shift_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
-; SSE41-LABEL: constant_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; SSE41-NEXT: pmulhw %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psraw $9, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: retq
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
;
-; AVX-LABEL: constant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT: vpsraw $9, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: retq
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $8, %xmm0, %xmm0
-; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $8, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $8, %xmm1
-; X32-SSE-NEXT: psraw $12, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: movapd %xmm0, %xmm1
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
-; X32-SSE-NEXT: andps %xmm2, %xmm0
-; X32-SSE-NEXT: psraw $1, %xmm1
-; X32-SSE-NEXT: andnps %xmm1, %xmm2
-; X32-SSE-NEXT: orps %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
}
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $27, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $26, %xmm3
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrad $25, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $27, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad $25, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrad $26, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_shift_v4i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $27, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $25, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $26, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $24, %xmm1
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $27, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad $26, %xmm3
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; X32-SSE-NEXT: psrad $25, %xmm0
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
}
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE2-NEXT: xorpd %xmm1, %xmm0
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: psubq %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_shift_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [1.4916681462400413E-154,1.2882297539194267E-231]
-; X32-SSE-NEXT: xorpd %xmm1, %xmm0
-; X32-SSE-NEXT: psubq %xmm1, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
@@ -2511,88 +2217,34 @@ define <2 x i8> @constant_shift_v2i8(<2
;
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrlq $5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrad $5, %xmm0
-; SSE41-NEXT: psrlq $5, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: psrad $5, %xmm0
+; SSE-NEXT: retq
;
-; AVX2-LABEL: splatconstant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrad $5, %xmm0, %xmm1
-; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i32:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpsrad $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $37, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $37, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psrad $5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrlq $5, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: retl
%shift = ashr <2 x i32> %a, <i32 5, i32 5>
ret <2 x i32> %shift
@@ -2601,135 +2253,66 @@ define <2 x i32> @splatconstant_shift_v2
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $19, %xmm0
+; SSE-NEXT: psraw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $19, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $16, %xmm0, %xmm0
-; XOP-NEXT: vpsrad $19, %xmm0, %xmm0
+; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $19, %xmm0, %xmm0
+; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $19, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
-; X32-SSE-NEXT: psrad $19, %xmm0
+; X32-SSE-NEXT: psraw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
ret <4 x i16> %shift
}
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psraw $3, %xmm0
+; SSE-NEXT: retq
;
-; AVX2-LABEL: splatconstant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $51, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $51, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psraw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i16> %a, <i16 3, i16 3>
ret <2 x i16> %shift
@@ -2738,38 +2321,52 @@ define <2 x i16> @splatconstant_shift_v2
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
-; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: psraw $11, %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $11, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $11, %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsraw $11, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraw $11, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $8, %xmm0
-; X32-SSE-NEXT: psraw $11, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: pxor %xmm1, %xmm0
+; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <8 x i8> %shift
@@ -2778,135 +2375,106 @@ define <8 x i8> @splatconstant_shift_v8i
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $24, %xmm0
-; SSE-NEXT: psrad $27, %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $27, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $24, %xmm0, %xmm0
-; XOP-NEXT: vpsrad $27, %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $27, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $27, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: psrad $27, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: pxor %xmm1, %xmm0
+; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
ret <4 x i8> %shift
}
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: retq
;
-; AVX2-LABEL: splatconstant_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $59, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $59, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: pxor %xmm1, %xmm0
+; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, <i8 3, i8 3>
ret <2 x i8> %shift
Modified: llvm/trunk/test/CodeGen/X86/vector-shift-by-select-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-by-select-loop.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-by-select-loop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-by-select-loop.ll Wed Aug 7 09:24:26 2019
@@ -29,91 +29,87 @@ define void @vector_variable_shift_left_
; SSE-NEXT: movd %r9d, %xmm0
; SSE-NEXT: movd %r8d, %xmm1
; SSE-NEXT: xorl %ecx, %ecx
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_4: # %vector.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE-NEXT: pslld $24, %xmm12
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE-NEXT: pslld $24, %xmm0
-; SSE-NEXT: pcmpeqw %xmm1, %xmm3
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE-NEXT: pslld $24, %xmm11
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pslld $24, %xmm3
-; SSE-NEXT: pcmpeqw %xmm1, %xmm2
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE-NEXT: pslld $24, %xmm9
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: pslld $24, %xmm2
-; SSE-NEXT: pcmpeqw %xmm1, %xmm15
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; SSE-NEXT: pslld $24, %xmm8
-; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE-NEXT: pslld $24, %xmm15
-; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm6
-; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm13, %xmm6
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm6
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pmovsxbd %xmm0, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE-NEXT: pcmpeqb %xmm1, %xmm3
+; SSE-NEXT: pmovsxbd %xmm3, %xmm13
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm3, %xmm6
+; SSE-NEXT: pcmpeqb %xmm1, %xmm4
+; SSE-NEXT: pmovsxbd %xmm4, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm3, %xmm2
+; SSE-NEXT: pcmpeqb %xmm1, %xmm5
+; SSE-NEXT: pmovsxbd %xmm5, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm3, %xmm9
+; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pslld %xmm15, %xmm4
+; SSE-NEXT: pslld %xmm14, %xmm3
+; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm3
; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm10
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm13, %xmm10
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm10
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm10
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm10
; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm12
; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm12
-; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm12
+; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm12
-; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm3
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm3
+; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm6
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm6
; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm1
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: pslld %xmm15, %xmm2
; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm2
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm5
+; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pslld %xmm15, %xmm4
+; SSE-NEXT: pslld %xmm14, %xmm2
; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm2
-; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm7
-; SSE-NEXT: pslld %xmm14, %xmm7
-; SSE-NEXT: pslld %xmm13, %xmm5
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm5
-; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: pslld %xmm15, %xmm7
; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm13, %xmm7
; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm7
+; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm4
; SSE-NEXT: movups %xmm10, (%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm6, 16(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm3, 32(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm3, 16(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm12, 48(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm2, 64(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm1, 80(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm7, 96(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm5, 112(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm4, 96(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm2, 112(%rdi,%rcx,4)
; SSE-NEXT: addq $32, %rcx
; SSE-NEXT: cmpq %rcx, %rdx
; SSE-NEXT: jne .LBB0_4
@@ -179,33 +175,33 @@ define void @vector_variable_shift_left_
; AVX1-NEXT: # xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpmovsxwd %xmm3, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm4, %xmm4
-; AVX1-NEXT: vpmovsxwd %xmm4, %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm5, %xmm5
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm4, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm5, %xmm5
; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9
; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm10
; AVX1-NEXT: vpslld %xmm1, %xmm9, %xmm0
; AVX1-NEXT: vblendvps %xmm7, %xmm10, %xmm0, %xmm9
-; AVX1-NEXT: vpmovsxwd %xmm5, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm6, %xmm6
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm6, %xmm6
; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm0
; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm6, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm6, %xmm6
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm10
; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2
; AVX1-NEXT: vpslld %xmm15, %xmm2, %xmm3
Modified: llvm/trunk/test/CodeGen/X86/vector-shift-lshr-sub128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-lshr-sub128.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-lshr-sub128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-lshr-sub128.ll Wed Aug 7 09:24:26 2019
@@ -20,103 +20,6 @@
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: xorps %xmm3, %xmm3
-; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
@@ -135,93 +38,70 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i16:
+; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrld %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrld %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm1, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i16:
+; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i16:
+; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i16:
+; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
@@ -239,111 +119,21 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = lshr <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i16> %a, %b
- ret <2 x i16> %shift
+ %shift = lshr <2 x i32> %a, %b
+ ret <2 x i32> %shift
}
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
@@ -369,20 +159,18 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v8i8:
+; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
@@ -402,16 +190,13 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v8i8:
+; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -423,11 +208,8 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v8i8:
+; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -437,31 +219,15 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v8i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v8i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: var_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ-LABEL: var_shift_v4i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -470,21 +236,17 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW-LABEL: var_shift_v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL-LABEL: var_shift_v4i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -492,22 +254,21 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
-; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL-LABEL: var_shift_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
-; X32-SSE-NEXT: pandn %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
@@ -532,138 +293,479 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = lshr <8 x i8> %a, %b
- ret <8 x i8> %shift
+ %shift = lshr <4 x i16> %a, %b
+ ret <4 x i16> %shift
}
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrld %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm4, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm1, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i8:
+; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i8:
+; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: var_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $12, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: psrlw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = lshr <2 x i16> %a, %b
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = lshr <8 x i8> %a, %b
+ ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrld %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm4, %xmm2
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm1, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, %b
ret <4 x i8> %shift
@@ -672,93 +774,148 @@ define <4 x i8> @var_shift_v4i8(<4 x i8>
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: var_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: var_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: var_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; XOPAVX1-NEXT: # xmm2 = mem[0,0]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, %b
ret <2 x i8> %shift
@@ -771,99 +928,46 @@ define <2 x i8> @var_shift_v2i8(<2 x i8>
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm2, %xmm3
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
%shift = lshr <2 x i32> %a, %splat
@@ -873,139 +977,46 @@ define <2 x i32> @splatvar_shift_v2i32(<
define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm1, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrld %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrld %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm1, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
%shift = lshr <4 x i16> %a, %splat
@@ -1015,302 +1026,168 @@ define <4 x i16> @splatvar_shift_v4i16(<
define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: splatvar_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
- %shift = lshr <2 x i16> %a, %splat
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+ %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+ %shift = lshr <2 x i16> %a, %splat
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
-; X32-SSE-NEXT: pandn %xmm2, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psrlw $1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
%shift = lshr <8 x i8> %a, %splat
@@ -1320,132 +1197,119 @@ define <8 x i8> @splatvar_shift_v8i8(<8
define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm1, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm1, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
%shift = lshr <4 x i8> %a, %splat
@@ -1455,94 +1319,110 @@ define <4 x i8> @splatvar_shift_v4i8(<4
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: splatvar_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; XOPAVX1-NEXT: # xmm2 = mem[0,0]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
%shift = lshr <2 x i8> %a, %splat
@@ -1556,75 +1436,62 @@ define <2 x i8> @splatvar_shift_v2i8(<2
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: psrlq $5, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psrld $4, %xmm1
+; SSE2-NEXT: psrld $5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $5, %xmm0
-; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $5, %xmm1
+; SSE41-NEXT: psrld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $4, %xmm1
-; X32-SSE-NEXT: psrlq $5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: psrld $4, %xmm1
+; X32-SSE-NEXT: psrld $5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
@@ -1633,91 +1500,66 @@ define <2 x i32> @constant_shift_v2i32(<
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $3, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrld $3, %xmm1
-; SSE41-NEXT: psrld $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $3, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld $2, %xmm2
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $1, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift
@@ -1726,75 +1568,72 @@ define <4 x i16> @constant_shift_v4i16(<
define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psrlw $3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $3, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: psrlw $3, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i16> %a, <i16 2, i16 3>
ret <2 x i16> %shift
@@ -1803,74 +1642,94 @@ define <2 x i16> @constant_shift_v2i16(<
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
-; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: constant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
@@ -1879,85 +1738,94 @@ define <8 x i8> @constant_shift_v8i8(<8
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $3, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrld $3, %xmm0
-; SSE41-NEXT: psrld $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $3, %xmm1
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld $2, %xmm2
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $1, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
@@ -1966,67 +1834,94 @@ define <4 x i8> @constant_shift_v4i8(<4
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
@@ -2037,167 +1932,102 @@ define <2 x i8> @constant_shift_v2i8(<2
;
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlq $5, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: psrlq $5, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatconstant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: retq
;
-; XOPAVX1-LABEL: splatconstant_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; XOPAVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatconstant_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; XOPAVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512-NEXT: vpsrlq $5, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vpsrlq $5, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlq $5, %xmm0
+; X32-SSE-NEXT: psrld $5, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i32> %a, <i32 5, i32 5>
ret <2 x i32> %shift
}
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrld $3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: psrld $3, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrld $3, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
ret <4 x i16> %shift
}
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlq $3, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i16> %a, <i16 3, i16 3>
ret <2 x i16> %shift
@@ -2206,38 +2036,37 @@ define <2 x i16> @splatconstant_shift_v2
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <8 x i8> %shift
@@ -2246,38 +2075,37 @@ define <8 x i8> @splatconstant_shift_v8i
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrld $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrld $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
ret <4 x i8> %shift
@@ -2286,38 +2114,37 @@ define <4 x i8> @splatconstant_shift_v4i
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrlq $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlq $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, <i8 3, i8 3>
ret <2 x i8> %shift
Modified: llvm/trunk/test/CodeGen/X86/vector-shift-shl-sub128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-shl-sub128.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-shl-sub128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-shl-sub128.ll Wed Aug 7 09:24:26 2019
@@ -20,88 +20,6 @@
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: xorps %xmm3, %xmm3
-; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
@@ -114,7 +32,7 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i16:
+; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
@@ -122,7 +40,7 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i16:
+; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
@@ -130,42 +48,32 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i16:
+; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i16:
+; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
@@ -178,125 +86,168 @@ define <4 x i16> @var_shift_v4i16(<4 x i
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
- %shift = shl <4 x i16> %a, %b
- ret <4 x i16> %shift
+ %shift = shl <2 x i32> %a, %b
+ ret <2 x i32> %shift
}
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v2i16:
+; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT: packusdw %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i16:
+; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v2i16:
+; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: var_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v2i16:
+; AVX512BWVL-LABEL: var_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT: pslld $23, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm3, %xmm2
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE-NEXT: pslld $23, %xmm1
+; X32-SSE-NEXT: paddd %xmm3, %xmm1
+; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE-NEXT: pmullw %xmm1, %xmm0
; X32-SSE-NEXT: retl
- %shift = shl <2 x i16> %a, %b
- ret <2 x i16> %shift
+ %shift = shl <4 x i16> %a, %b
+ ret <4 x i16> %shift
}
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v8i8:
+; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
@@ -304,15 +255,14 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v8i8:
+; AVX1-LABEL: var_shift_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
@@ -320,11 +270,10 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v8i8:
+; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -332,70 +281,209 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOP-LABEL: var_shift_v8i8:
+; XOP-LABEL: var_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ-LABEL: var_shift_v2i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW-LABEL: var_shift_v2i16:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL-LABEL: var_shift_v2i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
-; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL-LABEL: var_shift_v2i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE-LABEL: var_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; X32-SSE-NEXT: pslld $23, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm4, %xmm3
-; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT: pslld $23, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm3, %xmm2
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd %xmm4, %xmm1
+; X32-SSE-NEXT: paddd %xmm3, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: pmullw %xmm1, %xmm0
; X32-SSE-NEXT: retl
+ %shift = shl <2 x i16> %a, %b
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%shift = shl <8 x i8> %a, %b
ret <8 x i8> %shift
}
@@ -403,80 +491,142 @@ define <8 x i8> @var_shift_v8i8(<8 x i8>
define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
+; AVX-LABEL: var_shift_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: var_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i8> %a, %b
ret <4 x i8> %shift
@@ -485,71 +635,142 @@ define <4 x i8> @var_shift_v4i8(<4 x i8>
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: var_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: var_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: var_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i8> %a, %b
ret <2 x i8> %shift
@@ -562,88 +783,46 @@ define <2 x i8> @var_shift_v2i8(<2 x i8>
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pslld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: pslld %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
%shift = shl <2 x i32> %a, %splat
@@ -653,90 +832,46 @@ define <2 x i32> @splatvar_shift_v2i32(<
define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i16> %a, %splat
@@ -746,87 +881,46 @@ define <4 x i16> @splatvar_shift_v4i16(<
define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
%shift = shl <2 x i16> %a, %splat
@@ -836,138 +930,114 @@ define <2 x i16> @splatvar_shift_v2i16(<
define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psllw %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: packusdw %xmm1, %xmm2
-; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOP-LABEL: splatvar_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psllw %xmm1, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; X32-SSE-NEXT: pslld $23, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm4, %xmm3
-; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd %xmm4, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X32-SSE-NEXT: pmullw %xmm1, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
%shift = shl <8 x i8> %a, %splat
@@ -977,82 +1047,114 @@ define <8 x i8> @splatvar_shift_v8i8(<8
define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psllw %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psllw %xmm1, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i8> %a, %splat
@@ -1062,73 +1164,107 @@ define <4 x i8> @splatvar_shift_v4i8(<4
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psllw %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: splatvar_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psllw %xmm1, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
%shift = shl <2 x i8> %a, %splat
@@ -1143,120 +1279,109 @@ define <2 x i32> @constant_shift_v2i32(<
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $4, %xmm1
-; SSE2-NEXT: psllq $5, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: pslld $4, %xmm1
+; SSE2-NEXT: pslld $5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $5, %xmm1
-; SSE41-NEXT: psllq $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pslld $5, %xmm1
+; SSE41-NEXT: pslld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm1
+; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $4, %xmm1
-; X32-SSE-NEXT: psllq $5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: pslld $4, %xmm1
+; X32-SSE-NEXT: pslld $5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
}
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_shift_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift
@@ -1265,106 +1390,149 @@ define <4 x i16> @constant_shift_v4i16(<
define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $2, %xmm1
-; SSE2-NEXT: psllq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $3, %xmm1
-; SSE41-NEXT: psllq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: psllw $3, %xmm1
+; SSE41-NEXT: psllw $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $2, %xmm1
-; X32-SSE-NEXT: psllq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i16> %a, <i16 2, i16 3>
ret <2 x i16> %shift
}
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: constant_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: constant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE41-LABEL: constant_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
@@ -1373,61 +1541,87 @@ define <8 x i8> @constant_shift_v8i8(<8
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
@@ -1436,58 +1630,87 @@ define <4 x i8> @constant_shift_v4i8(<4
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $2, %xmm1
-; SSE2-NEXT: psllq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $3, %xmm1
-; SSE41-NEXT: psllq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $2, %xmm1
-; X32-SSE-NEXT: psllq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
@@ -1500,32 +1723,32 @@ define <2 x i8> @constant_shift_v2i8(<2
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $5, %xmm0
+; SSE-NEXT: pslld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllq $5, %xmm0, %xmm0
+; AVX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i32:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $5, %xmm0, %xmm0
+; XOP-NEXT: vpslld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $5, %xmm0, %xmm0
+; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $5, %xmm0, %xmm0
+; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $5, %xmm0
+; X32-SSE-NEXT: pslld $5, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i32> %a, <i32 5, i32 5>
ret <2 x i32> %shift
@@ -1534,32 +1757,32 @@ define <2 x i32> @splatconstant_shift_v2
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $3, %xmm0, %xmm0
+; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
ret <4 x i16> %shift
@@ -1568,32 +1791,32 @@ define <4 x i16> @splatconstant_shift_v4
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $3, %xmm0, %xmm0
+; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i16> %a, <i16 3, i16 3>
ret <2 x i16> %shift
@@ -1603,31 +1826,36 @@ define <8 x i8> @splatconstant_shift_v8i
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <8 x i8> %shift
@@ -1636,32 +1864,37 @@ define <8 x i8> @splatconstant_shift_v8i
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
ret <4 x i8> %shift
@@ -1670,32 +1903,37 @@ define <4 x i8> @splatconstant_shift_v4i
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i8> %a, <i8 3, i8 3>
ret <2 x i8> %shift
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll Wed Aug 7 09:24:26 2019
@@ -841,26 +841,14 @@ entry:
}
define <16 x i8> @PR20540(<8 x i8> %a) {
-; SSE2-LABEL: PR20540:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR20540:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR20540:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
+; SSE-LABEL: PR20540:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: retq
;
; AVX-LABEL: PR20540:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Wed Aug 7 09:24:26 2019
@@ -1715,41 +1715,37 @@ define <4 x i8> @combine_test1c(<4 x i8>
; SSE2-LABEL: combine_test1c:
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test1c:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test1c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test1c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test1c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1758,40 +1754,18 @@ define <4 x i8> @combine_test1c(<4 x i8>
}
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
-; SSE2-LABEL: combine_test2c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: combine_test2c:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: combine_test2c:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_test2c:
+; SSE: # %bb.0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_test2c:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
@@ -1801,40 +1775,20 @@ define <4 x i8> @combine_test2c(<4 x i8>
}
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
-; SSE2-LABEL: combine_test3c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: combine_test3c:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: combine_test3c:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_test3c:
+; SSE: # %bb.0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_test3c:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
@@ -1846,48 +1800,38 @@ define <4 x i8> @combine_test3c(<4 x i8>
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test4c:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test4c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test4c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test4c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll Wed Aug 7 09:24:26 2019
@@ -1398,6 +1398,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
@@ -1504,6 +1505,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8:
@@ -1597,6 +1599,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
; SSE41-NEXT: packusdw %xmm4, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -1625,25 +1628,33 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1652,7 +1663,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
@@ -2018,14 +2029,16 @@ define void @trunc_packus_v8i64_v8i8_sto
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -2745,84 +2758,25 @@ define <16 x i8> @trunc_packus_v16i64_v1
}
define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_packus_v8i32_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i32_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i32_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_packus_v8i32_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2832,8 +2786,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2842,7 +2795,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -2852,8 +2805,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -2862,7 +2814,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -2905,8 +2857,7 @@ define void @trunc_packus_v8i32_v8i8_sto
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2925,8 +2876,7 @@ define void @trunc_packus_v8i32_v8i8_sto
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll Wed Aug 7 09:24:26 2019
@@ -1165,7 +1165,7 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
; SSE2-NEXT: movdqa %xmm9, %xmm7
@@ -1176,23 +1176,10 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; SSE2-NEXT: pand %xmm10, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
@@ -1202,85 +1189,104 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; SSE2-NEXT: pand %xmm10, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: por %xmm0, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm7
; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: por %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v8i64_v8i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
; SSSE3-NEXT: movdqa %xmm9, %xmm7
@@ -1291,23 +1297,10 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; SSSE3-NEXT: pand %xmm10, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
@@ -1317,129 +1310,148 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; SSSE3-NEXT: pand %xmm10, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm0, %xmm7
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
; SSSE3-NEXT: movdqa %xmm7, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm7
; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm2, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm5, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm5
; SSSE3-NEXT: pandn %xmm8, %xmm1
; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: packssdw %xmm3, %xmm1
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm3, %xmm7
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: packuswb %xmm7, %xmm0
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v8i64_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127]
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movapd %xmm11, %xmm10
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm11, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; SSE41-NEXT: movdqa %xmm6, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
; SSE41-NEXT: movapd %xmm11, %xmm1
; SSE41-NEXT: xorpd %xmm5, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840]
@@ -1449,86 +1461,111 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6
; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm6
+; SSE41-NEXT: xorpd %xmm5, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packssdw %xmm6, %xmm1
-; SSE41-NEXT: movapd %xmm9, %xmm2
-; SSE41-NEXT: xorpd %xmm5, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm6
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT: movapd %xmm10, %xmm3
+; SSE41-NEXT: xorpd %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm6
; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2
-; SSE41-NEXT: xorpd %xmm8, %xmm5
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
+; SSE41-NEXT: xorpd %xmm9, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packssdw %xmm2, %xmm3
-; SSE41-NEXT: packssdw %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2
+; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE41-NEXT: andpd %xmm0, %xmm2
+; SSE41-NEXT: andpd %xmm0, %xmm3
+; SSE41-NEXT: packusdw %xmm2, %xmm3
+; SSE41-NEXT: andpd %xmm0, %xmm7
+; SSE41-NEXT: andpd %xmm0, %xmm1
+; SSE41-NEXT: packusdw %xmm7, %xmm1
+; SSE41-NEXT: packusdw %xmm3, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
+; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11
+; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8
+; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1
+; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1536,7 +1573,7 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
@@ -1948,14 +1985,16 @@ define void @trunc_ssat_v8i64_v8i8_store
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -2707,92 +2746,25 @@ define <16 x i8> @trunc_ssat_v16i64_v16i
}
define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_ssat_v8i32_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i32_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i32_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168]
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_ssat_v8i32_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v8i32_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2802,8 +2774,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2811,7 +2782,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -2821,8 +2792,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -2830,7 +2800,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -2873,8 +2843,7 @@ define void @trunc_ssat_v8i32_v8i8_store
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2891,8 +2860,7 @@ define void @trunc_ssat_v8i32_v8i8_store
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll Wed Aug 7 09:24:26 2019
@@ -946,55 +946,56 @@ define <16 x i16> @trunc_usat_v16i32_v16
define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i8:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm1
@@ -1003,59 +1004,61 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i64_v8i8:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: packuswb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: packuswb %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm6, %xmm1
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm2, %xmm6
; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm1
@@ -1064,6 +1067,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: packuswb %xmm4, %xmm1
; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i64_v8i8:
@@ -1116,6 +1120,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
; SSE41-NEXT: packusdw %xmm5, %xmm9
; SSE41-NEXT: packusdw %xmm9, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -1141,6 +1146,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1148,24 +1154,31 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_usat_v8i64_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
@@ -1394,14 +1407,16 @@ define void @trunc_usat_v8i64_v8i8_store
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1850,59 +1865,70 @@ define <16 x i8> @trunc_usat_v16i64_v16i
define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
; SSE2-LABEL: trunc_usat_v8i32_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: packuswb %xmm6, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i32_v8i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm1
+; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: packuswb %xmm6, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm5
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i32_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm2, %xmm1
; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pminud %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: pshufb %xmm2, %xmm0
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1911,7 +1937,10 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1919,15 +1948,14 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_usat_v8i32_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -1935,15 +1963,14 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1992,32 +2019,35 @@ define void @trunc_usat_v8i32_v8i8_store
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm5
; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: pshufb %xmm0, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSSE3-NEXT: movq %xmm6, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i32_v8i8_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm2, %xmm1
; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: pminud %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: pshufb %xmm2, %xmm0
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i32_v8i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -2027,8 +2057,10 @@ define void @trunc_usat_v8i32_v8i8_store
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -2037,8 +2069,7 @@ define void @trunc_usat_v8i32_v8i8_store
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2053,8 +2084,7 @@ define void @trunc_usat_v8i32_v8i8_store
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Wed Aug 7 09:24:26 2019
@@ -296,32 +296,22 @@ define void @trunc8i64_8i8(<8 x i64> %a)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: trunc8i64_8i8:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc8i64_8i8:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: trunc8i64_8i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # %bb.0: # %entry
@@ -587,9 +577,11 @@ define void @trunc8i32_8i8(<8 x i32> %a)
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -597,8 +589,7 @@ define void @trunc8i32_8i8(<8 x i32> %a)
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -612,8 +603,7 @@ define void @trunc8i32_8i8(<8 x i32> %a)
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1480,39 +1470,53 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i16:
; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
@@ -1520,22 +1524,16 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1544,21 +1542,17 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm2
-; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2w %xmm1, %xmm2, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/vector-truncate-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-truncate-combine.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-truncate-combine.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-truncate-combine.ll Wed Aug 7 09:24:26 2019
@@ -14,12 +14,8 @@
; NOTE: This operation could be collapsed in to a single truncate. Once that is done
; this test will have to be adjusted.
-; CHECK: PUNPCKLBWrr
-; CHECK: PUNPCKLWDrr
; CHECK: PANDrm
; CHECK: PACKUSWBrr
-; CHECK: PACKUSWBrr
-; CHECK: PACKUSWBrr
; CHECK: MOVPDI2DIrr
define void @test(double %vec.coerce) local_unnamed_addr {
Modified: llvm/trunk/test/CodeGen/X86/vector-zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-zext.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll Wed Aug 7 09:24:26 2019
@@ -397,16 +397,15 @@ entry:
define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -414,15 +413,13 @@ define <8 x i64> @zext_16i8_to_8i64(<16
;
; SSSE3-LABEL: zext_16i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i64:
@@ -1159,16 +1156,15 @@ entry:
define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -1176,16 +1172,14 @@ define <8 x i64> @load_zext_8i8_to_8i64(
;
; SSSE3-LABEL: load_zext_8i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i64:
@@ -1508,8 +1502,8 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x
; SSE2-LABEL: zext_8i8_to_8i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1518,8 +1512,8 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x
; SSSE3-LABEL: zext_8i8_to_8i32:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1527,32 +1521,28 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x
;
; SSE41-LABEL: zext_8i8_to_8i32:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i8_to_8i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i8_to_8i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i8_to_8i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
%t = zext <8 x i8> %z to <8 x i32>
@@ -1659,8 +1649,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1669,8 +1659,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1678,7 +1668,6 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(
;
; SSE41-LABEL: shuf_zext_8i8_to_8i32:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -1687,7 +1676,6 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(
;
; AVX1-LABEL: shuf_zext_8i8_to_8i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -1696,13 +1684,11 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(
;
; AVX2-LABEL: shuf_zext_8i8_to_8i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i8_to_8i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -2275,11 +2261,11 @@ define <32 x i32> @zext_32i8_to_32i32(<3
; AVX2-LABEL: zext_32i8_to_32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq
@@ -2303,28 +2289,33 @@ define <2 x i32> @zext_2i8_to_2i32(<2 x
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_2i8_to_2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSSE3-NEXT: paddq %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: paddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_2i8_to_2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: paddq %xmm0, %xmm0
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: paddd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_2i8_to_2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%x = load <2 x i8>, <2 x i8>* %addr, align 1
%y = zext <2 x i8> %x to <2 x i32>
Modified: llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll Wed Aug 7 09:24:26 2019
@@ -8,12 +8,12 @@
define <8 x i32> @eq_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: eq_zero:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -28,8 +28,8 @@ define <8 x i32> @eq_zero(<8 x i8>* %p,
;
; AVX512-LABEL: eq_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vptestnmw %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <8 x i8>, <8 x i8>* %p
@@ -41,14 +41,14 @@ define <8 x i32> @eq_zero(<8 x i8>* %p,
define <4 x i64> @ne_zero(<4 x i16>* %p, <4 x i64> %x, <4 x i64> %y) {
; AVX1-LABEL: ne_zero:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxwq %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -63,8 +63,8 @@ define <4 x i64> @ne_zero(<4 x i16>* %p,
;
; AVX512-LABEL: ne_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k1
; AVX512-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <4 x i16>, <4 x i16>* %p
@@ -111,10 +111,10 @@ define <16 x i16> @sgt_zero(<16 x i8>* %
define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: slt_zero:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbw (%rdi), %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -127,8 +127,8 @@ define <8 x i32> @slt_zero(<8 x i8>* %p,
;
; AVX512-LABEL: slt_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbw (%rdi), %xmm2
-; AVX512-NEXT: vpmovw2m %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vpmovb2m %xmm2, %k1
; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <8 x i8>, <8 x i8>* %p
@@ -140,12 +140,12 @@ define <8 x i32> @slt_zero(<8 x i8>* %p,
define <4 x double> @eq_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x double> %y) {
; AVX1-LABEL: eq_zero_fp_select:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -160,8 +160,8 @@ define <4 x double> @eq_zero_fp_select(<
;
; AVX512-LABEL: eq_zero_fp_select:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX512-NEXT: vptestnmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <4 x i8>, <4 x i8>* %p
@@ -173,14 +173,14 @@ define <4 x double> @eq_zero_fp_select(<
define <8 x float> @ne_zero_fp_select(<8 x i8>* %p, <8 x float> %x, <8 x float> %y) {
; AVX1-LABEL: ne_zero_fp_select:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -195,8 +195,8 @@ define <8 x float> @ne_zero_fp_select(<8
;
; AVX512-LABEL: ne_zero_fp_select:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vptestmb %xmm2, %xmm2, %k1
; AVX512-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <8 x i8>, <8 x i8>* %p
@@ -208,12 +208,12 @@ define <8 x float> @ne_zero_fp_select(<8
define <4 x double> @sgt_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x double> %y) {
; AVX1-LABEL: sgt_zero_fp_select:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd (%rdi), %xmm2
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -228,9 +228,9 @@ define <4 x double> @sgt_zero_fp_select(
;
; AVX512-LABEL: sgt_zero_fp_select:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd (%rdi), %xmm2
+; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k1
+; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k1
; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <4 x i8>, <4 x i8>* %p
Modified: llvm/trunk/test/CodeGen/X86/vselect-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vselect-avx.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vselect-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vselect-avx.ll Wed Aug 7 09:24:26 2019
@@ -96,11 +96,12 @@ define void @test3(<4 x i32> %induction3
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vmovq %xmm1, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
@@ -118,11 +119,12 @@ define void @test3(<4 x i32> %induction3
; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vmovq %xmm1, (%rsi)
; AVX2-NEXT: retq
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
Modified: llvm/trunk/test/CodeGen/X86/vselect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vselect.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vselect.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vselect.ll Wed Aug 7 09:24:26 2019
@@ -563,36 +563,43 @@ define <2 x i64> @shrunkblend_nonvselect
define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
; SSE2-LABEL: simplify_select:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movd %edi, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: simplify_select:
; SSE41: # %bb.0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: movd %edi, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE41-NEXT: pinsrd $1, %edi, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: simplify_select:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: simplify_select:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: simplify_select:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vmovd %edi, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: retq
%a = insertelement <2 x i32> <i32 0, i32 undef>, i32 %x, i32 1
%b = insertelement <2 x i32> <i32 undef, i32 0>, i32 %x, i32 0
%y = or <2 x i32> %a, %b
@@ -643,34 +650,28 @@ define i64 @vselect_any_extend_vector_in
; SSE2-LABEL: vselect_any_extend_vector_inreg_crash:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pcmpeqw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: psllq $56, %xmm0
+; SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: andl $32768, %eax # imm = 0x8000
; SSE2-NEXT: retq
;
; SSE41-LABEL: vselect_any_extend_vector_inreg_crash:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movl $32768, %eax # imm = 0x8000
-; SSE41-NEXT: movq %rax, %xmm1
-; SSE41-NEXT: xorpd %xmm2, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: andl $32768, %eax # imm = 0x8000
; SSE41-NEXT: retq
;
; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: andl $32768, %eax # imm = 0x8000
; AVX-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vshift-4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vshift-4.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vshift-4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vshift-4.ll Wed Aug 7 09:24:26 2019
@@ -58,7 +58,7 @@ define void @shift2a(<4 x i32> %val, <4
; X32-LABEL: shift2a:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-NEXT: pslld %xmm2, %xmm0
@@ -67,7 +67,7 @@ define void @shift2a(<4 x i32> %val, <4
;
; X64-LABEL: shift2a:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT: pslld %xmm2, %xmm0
@@ -84,7 +84,7 @@ define void @shift2b(<4 x i32> %val, <4
; X32-LABEL: shift2b:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-NEXT: pslld %xmm2, %xmm0
@@ -93,7 +93,7 @@ define void @shift2b(<4 x i32> %val, <4
;
; X64-LABEL: shift2b:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT: pslld %xmm2, %xmm0
@@ -110,7 +110,7 @@ define void @shift2c(<4 x i32> %val, <4
; X32-LABEL: shift2c:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-NEXT: pslld %xmm2, %xmm0
@@ -119,7 +119,7 @@ define void @shift2c(<4 x i32> %val, <4
;
; X64-LABEL: shift2c:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT: pslld %xmm2, %xmm0
Modified: llvm/trunk/test/CodeGen/X86/widen_arith-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_arith-1.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_arith-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_arith-1.ll Wed Aug 7 09:24:26 2019
@@ -4,10 +4,9 @@
define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %forcond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -19,15 +18,14 @@ define void @update(<3 x i8>* %dst, <3 x
; CHECK-NEXT: movl (%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: pmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; CHECK-NEXT: psubd %xmm0, %xmm2
-; CHECK-NEXT: pextrb $8, %xmm2, 2(%ecx,%eax,4)
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: pextrw $0, %xmm2, (%ecx,%eax,4)
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: psubb %xmm0, %xmm1
+; CHECK-NEXT: pextrb $2, %xmm1, 2(%ecx,%eax,4)
+; CHECK-NEXT: pextrw $0, %xmm1, (%ecx,%eax,4)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .LBB0_3: # %afterfor
-; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: popl %eax
; CHECK-NEXT: retl
entry:
%dst.addr = alloca <3 x i8>*
Modified: llvm/trunk/test/CodeGen/X86/widen_arith-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_arith-2.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_arith-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_arith-2.ll Wed Aug 7 09:24:26 2019
@@ -9,7 +9,7 @@ define void @update(i64* %dst_i, i64* %s
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4,4,4,4,4,4,4,4,u,u,u,u,u,u,u,u>
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %forcond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -25,10 +25,9 @@ define void @update(i64* %dst_i, i64* %s
; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; CHECK-NEXT: psubw %xmm0, %xmm2
+; CHECK-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: psubb %xmm0, %xmm2
; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: packuswb %xmm0, %xmm2
; CHECK-NEXT: movq %xmm2, (%edx,%eax,8)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: jmp .LBB0_1
Modified: llvm/trunk/test/CodeGen/X86/widen_arith-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_arith-3.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_arith-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_arith-3.ll Wed Aug 7 09:24:26 2019
@@ -12,8 +12,7 @@ define void @update(<3 x i16>* %dst, <3
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
-; CHECK-NEXT: subl $32, %esp
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: subl $16, %esp
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $65537, {{[0-9]+}}(%esp) # imm = 0x10001
@@ -29,11 +28,11 @@ define void @update(<3 x i16>* %dst, <3
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl 12(%ebp), %edx
; CHECK-NEXT: movl 8(%ebp), %ecx
-; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; CHECK-NEXT: psubd %xmm0, %xmm2
-; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8)
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: movd %xmm2, (%ecx,%eax,8)
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pinsrw $2, 4(%edx,%eax,8), %xmm1
+; CHECK-NEXT: psubw %xmm0, %xmm1
+; CHECK-NEXT: pextrw $2, %xmm1, 4(%ecx,%eax,8)
+; CHECK-NEXT: movd %xmm1, (%ecx,%eax,8)
; CHECK-NEXT: incl {{[0-9]+}}(%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .LBB0_3: # %afterfor
Modified: llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_bitops-0.ll Wed Aug 7 09:24:26 2019
@@ -132,15 +132,15 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8
; X32-SSE-LABEL: and_v3i8_as_i24:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X32-SSE-NEXT: # kill: def $al killed $al killed $eax
; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -149,15 +149,15 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8
; X64-SSE-LABEL: and_v3i8_as_i24:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
-; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0
; X64-SSE-NEXT: movd %edi, %xmm1
-; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
-; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1
; X64-SSE-NEXT: pand %xmm0, %xmm1
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X64-SSE-NEXT: # kill: def $al killed $al killed $eax
; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -173,15 +173,15 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8
; X32-SSE-LABEL: xor_v3i8_as_i24:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pxor %xmm0, %xmm1
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X32-SSE-NEXT: # kill: def $al killed $al killed $eax
; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -190,15 +190,15 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8
; X64-SSE-LABEL: xor_v3i8_as_i24:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
-; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0
; X64-SSE-NEXT: movd %edi, %xmm1
-; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
-; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1
; X64-SSE-NEXT: pxor %xmm0, %xmm1
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X64-SSE-NEXT: # kill: def $al killed $al killed $eax
; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -214,15 +214,15 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8>
; X32-SSE-LABEL: or_v3i8_as_i24:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X32-SSE-NEXT: # kill: def $al killed $al killed $eax
; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -231,15 +231,15 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8>
; X64-SSE-LABEL: or_v3i8_as_i24:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
-; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0
; X64-SSE-NEXT: movd %edi, %xmm1
-; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
-; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1
; X64-SSE-NEXT: por %xmm0, %xmm1
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X64-SSE-NEXT: # kill: def $al killed $al killed $eax
; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
Modified: llvm/trunk/test/CodeGen/X86/widen_cast-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-1.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-1.ll Wed Aug 7 09:24:26 2019
@@ -12,7 +12,6 @@ define void @convert(<2 x i32>* %dst, <4
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: cmpl $3, (%esp)
; CHECK-NEXT: jg .LBB0_3
; CHECK-NEXT: .p2align 4, 0x90
@@ -21,10 +20,9 @@ define void @convert(<2 x i32>* %dst, <4
; CHECK-NEXT: movl (%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; CHECK-NEXT: psubw %xmm0, %xmm2
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: movq %xmm2, (%ecx,%eax,8)
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: psubw %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm1, (%ecx,%eax,8)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
; CHECK-NEXT: jle .LBB0_2
@@ -36,7 +34,6 @@ define void @convert(<2 x i32>* %dst, <4
; ATOM: # %bb.0: # %entry
; ATOM-NEXT: pushl %eax
; ATOM-NEXT: pcmpeqd %xmm0, %xmm0
-; ATOM-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ATOM-NEXT: movl $0, (%esp)
; ATOM-NEXT: cmpl $3, (%esp)
; ATOM-NEXT: jg .LBB0_3
@@ -45,12 +42,10 @@ define void @convert(<2 x i32>* %dst, <4
; ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; ATOM-NEXT: movl (%esp), %eax
; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; ATOM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; ATOM-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; ATOM-NEXT: psubw %xmm0, %xmm2
-; ATOM-NEXT: pshufb %xmm1, %xmm2
-; ATOM-NEXT: movq %xmm2, (%ecx,%eax,8)
+; ATOM-NEXT: psubw %xmm0, %xmm1
+; ATOM-NEXT: movq %xmm1, (%ecx,%eax,8)
; ATOM-NEXT: incl (%esp)
; ATOM-NEXT: cmpl $3, (%esp)
; ATOM-NEXT: jle .LBB0_2
Modified: llvm/trunk/test/CodeGen/X86/widen_cast-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-2.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-2.ll Wed Aug 7 09:24:26 2019
@@ -21,8 +21,9 @@ define void @convert(<7 x i32>* %dst, <1
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
; CHECK-NEXT: psubw %xmm0, %xmm1
; CHECK-NEXT: psubw %xmm0, %xmm2
+; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
+; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
-; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax)
; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
Modified: llvm/trunk/test/CodeGen/X86/widen_cast-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-3.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-3.ll Wed Aug 7 09:24:26 2019
@@ -11,7 +11,8 @@ define void @convert(<12 x i8>* %dst.add
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
+; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert:
Modified: llvm/trunk/test/CodeGen/X86/widen_cast-4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-4.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-4.ll Wed Aug 7 09:24:26 2019
@@ -10,7 +10,8 @@ define void @update(i64* %dst_i, i64* %s
; NARROW-NEXT: subl $12, %esp
; NARROW-NEXT: movl $0, (%esp)
; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
-; NARROW-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; NARROW-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; NARROW-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; NARROW-NEXT: .p2align 4, 0x90
; NARROW-NEXT: .LBB0_1: # %forcond
; NARROW-NEXT: # =>This Inner Loop Header: Depth=1
@@ -26,13 +27,13 @@ define void @update(i64* %dst_i, i64* %s
; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp)
; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx
; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; NARROW-NEXT: psubw %xmm0, %xmm2
-; NARROW-NEXT: psllw $8, %xmm2
-; NARROW-NEXT: psraw $8, %xmm2
-; NARROW-NEXT: psrlw $2, %xmm2
-; NARROW-NEXT: pshufb %xmm1, %xmm2
-; NARROW-NEXT: movq %xmm2, (%edx,%eax,8)
+; NARROW-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; NARROW-NEXT: psubb %xmm0, %xmm3
+; NARROW-NEXT: psrlw $2, %xmm3
+; NARROW-NEXT: pand %xmm1, %xmm3
+; NARROW-NEXT: pxor %xmm2, %xmm3
+; NARROW-NEXT: psubb %xmm2, %xmm3
+; NARROW-NEXT: movq %xmm3, (%edx,%eax,8)
; NARROW-NEXT: incl (%esp)
; NARROW-NEXT: jmp .LBB0_1
; NARROW-NEXT: .LBB0_3: # %afterfor
Modified: llvm/trunk/test/CodeGen/X86/widen_cast-5.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-5.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-5.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-5.ll Wed Aug 7 09:24:26 2019
@@ -8,18 +8,15 @@ define void @convert(<2 x i32>* %dst.add
; X86-LABEL: convert:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; X86-NEXT: pxor LCPI0_0, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: xorps LCPI0_0, %xmm0
+; X86-NEXT: movlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq %rsi, %xmm0
-; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pxor {{.*}}(%rip), %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/widen_cast-6.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-6.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-6.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-6.ll Wed Aug 7 09:24:26 2019
@@ -7,9 +7,7 @@
define i32 @return_v2hi() nounwind {
; X86-LABEL: return_v2hi:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: pushl %eax
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: popl %ecx
; X86-NEXT: retl
;
; X64-LABEL: return_v2hi:
Modified: llvm/trunk/test/CodeGen/X86/widen_compare-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_compare-1.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_compare-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_compare-1.ll Wed Aug 7 09:24:26 2019
@@ -7,12 +7,12 @@
define <2 x i16> @compare_v2i64_to_v2i16_unary(<2 x i16>* %src) nounwind {
; X86-LABEL: compare_v2i64_to_v2i16_unary:
; X86: # %bb.0:
-; X86-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0]
+; X86-NEXT: pcmpeqd %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: compare_v2i64_to_v2i16_unary:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [65535,65535]
+; X64-NEXT: pcmpeqd %xmm0, %xmm0
; X64-NEXT: retq
%val = load <2 x i16>, <2 x i16>* %src, align 4
%cmp = icmp uge <2 x i16> %val, %val
@@ -25,20 +25,18 @@ define <2 x i16> @compare_v2i64_to_v2i16
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-NEXT: pcmpgtq %xmm0, %xmm1
-; X86-NEXT: pcmpeqd %xmm0, %xmm0
-; X86-NEXT: pxor %xmm1, %xmm0
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pmaxuw %xmm1, %xmm0
+; X86-NEXT: pcmpeqw %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: compare_v2i64_to_v2i16_binary:
; X64: # %bb.0:
-; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-NEXT: pcmpgtq %xmm0, %xmm1
-; X64-NEXT: pcmpeqd %xmm0, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pmaxuw %xmm1, %xmm0
+; X64-NEXT: pcmpeqw %xmm1, %xmm0
; X64-NEXT: retq
%val0 = load <2 x i16>, <2 x i16>* %src0, align 4
%val1 = load <2 x i16>, <2 x i16>* %src1, align 4
Modified: llvm/trunk/test/CodeGen/X86/widen_conv-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_conv-1.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_conv-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_conv-1.ll Wed Aug 7 09:24:26 2019
@@ -8,16 +8,17 @@ define void @convert_v2i64_to_v2i32(<2 x
; X86-LABEL: convert_v2i64_to_v2i32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert_v2i64_to_v2i32:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddd {{.*}}(%rip), %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
entry:
@@ -32,25 +33,23 @@ entry:
define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) nounwind {
; X86-LABEL: convert_v3i32_to_v3i8:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqa (%ecx), %xmm0
-; X86-NEXT: pcmpeqd %xmm1, %xmm1
-; X86-NEXT: psubd %xmm1, %xmm0
-; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: psubb %xmm1, %xmm0
+; X86-NEXT: pextrb $2, %xmm0, 2(%eax)
; X86-NEXT: pextrw $0, %xmm0, (%eax)
-; X86-NEXT: popl %eax
; X86-NEXT: retl
;
; X64-LABEL: convert_v3i32_to_v3i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
-; X64-NEXT: pcmpeqd %xmm1, %xmm1
-; X64-NEXT: psubd %xmm1, %xmm0
-; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: pextrb $2, %xmm0, 2(%rdi)
; X64-NEXT: pextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
entry:
@@ -66,29 +65,23 @@ entry:
define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind {
; X86-LABEL: convert_v5i16_to_v5i8:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqa (%ecx), %xmm0
-; X86-NEXT: pcmpeqd %xmm1, %xmm1
-; X86-NEXT: psubw %xmm1, %xmm0
-; X86-NEXT: pextrb $8, %xmm0, 4(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: psubb %xmm1, %xmm0
+; X86-NEXT: pextrb $4, %xmm0, 4(%eax)
; X86-NEXT: movd %xmm0, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: convert_v5i16_to_v5i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
-; X64-NEXT: pcmpeqd %xmm1, %xmm1
-; X64-NEXT: psubw %xmm1, %xmm0
-; X64-NEXT: pextrb $8, %xmm0, 4(%rdi)
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: pextrb $4, %xmm0, 4(%rdi)
; X64-NEXT: movd %xmm0, (%rdi)
; X64-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/widen_conv-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_conv-2.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_conv-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_conv-2.ll Wed Aug 7 09:24:26 2019
@@ -8,17 +8,13 @@ define void @convert_v2i16_v2i32(<2 x i3
; X86-LABEL: convert_v2i16_v2i32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: psllq $48, %xmm0
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmovsxwd %xmm0, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert_v2i16_v2i32:
; X64: # %bb.0: # %entry
-; X64-NEXT: psllq $48, %xmm0
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: pmovsxwd %xmm0, %xmm0
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/widen_conv-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_conv-3.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_conv-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_conv-3.ll Wed Aug 7 09:24:26 2019
@@ -7,24 +7,37 @@
; sign to float v2i16 to v2f32
define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
-; X86-LABEL: convert_v2i16_to_v2f32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: psllq $48, %xmm0
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT: cvtdq2ps %xmm0, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
-; X86-NEXT: retl
-;
-; X64-LABEL: convert_v2i16_to_v2f32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: psllq $48, %xmm0
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-NEXT: cvtdq2ps %xmm0, %xmm0
-; X64-NEXT: movlps %xmm0, (%rdi)
-; X64-NEXT: retq
+; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movlps %xmm0, (%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE42: # %bb.0: # %entry
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: pmovsxwd %xmm0, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: movlps %xmm0, (%eax)
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v2i16_to_v2f32:
+; X64-SSE2: # %bb.0: # %entry
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v2i16_to_v2f32:
+; X64-SSE42: # %bb.0: # %entry
+; X64-SSE42-NEXT: pmovsxwd %xmm0, %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE42-NEXT: retq
entry:
%val = sitofp <2 x i16> %src to <2 x float>
store <2 x float> %val, <2 x float>* %dst.addr, align 4
@@ -36,28 +49,19 @@ entry:
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
; X86-SSE2: # %bb.0: # %entry
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movzwl (%ecx), %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT: shll $8, %edx
-; X86-SSE2-NEXT: pxor %xmm0, %xmm0
-; X86-SSE2-NEXT: pinsrw $1, %edx, %xmm0
-; X86-SSE2-NEXT: shll $8, %esi
-; X86-SSE2-NEXT: pinsrw $3, %esi, %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT: shll $8, %ecx
-; X86-SSE2-NEXT: pinsrw $5, %ecx, %xmm0
+; X86-SSE2-NEXT: movd %ecx, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: pandn %xmm2, %xmm1
+; X86-SSE2-NEXT: por %xmm0, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE2-NEXT: psrad $24, %xmm0
; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
@@ -66,47 +70,35 @@ define void @convert_v3i8_to_v3f32(<3 x
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT: leal -4(%ebp), %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
; X86-SSE42: # %bb.0: # %entry
-; X86-SSE42-NEXT: pushl %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
-; X86-SSE42-NEXT: movzwl (%ecx), %ecx
-; X86-SSE42-NEXT: movd %ecx, %xmm0
-; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
-; X86-SSE42-NEXT: pslld $24, %xmm0
-; X86-SSE42-NEXT: psrad $24, %xmm0
+; X86-SSE42-NEXT: movzwl (%ecx), %edx
+; X86-SSE42-NEXT: movd %edx, %xmm0
+; X86-SSE42-NEXT: pinsrb $2, 2(%ecx), %xmm0
+; X86-SSE42-NEXT: pmovsxbd %xmm0, %xmm0
; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
; X86-SSE42-NEXT: movss %xmm0, (%eax)
-; X86-SSE42-NEXT: popl %eax
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movd %eax, %xmm0
-; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT: shll $8, %eax
-; X64-SSE2-NEXT: pxor %xmm0, %xmm0
-; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
-; X64-SSE2-NEXT: shll $8, %ecx
-; X64-SSE2-NEXT: pinsrw $3, %ecx, %xmm0
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT: shll $8, %eax
-; X64-SSE2-NEXT: pinsrw $5, %eax, %xmm0
+; X64-SSE2-NEXT: movd %eax, %xmm2
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: pandn %xmm2, %xmm1
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE2-NEXT: psrad $24, %xmm0
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
@@ -116,13 +108,10 @@ define void @convert_v3i8_to_v3f32(<3 x
;
; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movd %ecx, %xmm0
-; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
-; X64-SSE42-NEXT: pslld $24, %xmm0
-; X64-SSE42-NEXT: psrad $24, %xmm0
+; X64-SSE42-NEXT: movzwl (%rsi), %eax
+; X64-SSE42-NEXT: movd %eax, %xmm0
+; X64-SSE42-NEXT: pinsrb $2, 2(%rsi), %xmm0
+; X64-SSE42-NEXT: pmovsxbd %xmm0, %xmm0
; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
Modified: llvm/trunk/test/CodeGen/X86/widen_conv-4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_conv-4.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_conv-4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_conv-4.ll Wed Aug 7 09:24:26 2019
@@ -28,15 +28,15 @@ define void @convert_v7i16_v7f32(<7 x fl
; X86-SSE42-LABEL: convert_v7i16_v7f32:
; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: pxor %xmm1, %xmm1
-; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1
+; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
-; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
-; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax)
-; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax)
-; X86-SSE42-NEXT: movups %xmm1, (%eax)
-; X86-SSE42-NEXT: movss %xmm0, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: extractps $2, %xmm1, 24(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm1, 20(%eax)
+; X86-SSE42-NEXT: movss %xmm1, 16(%eax)
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v7i16_v7f32:
@@ -55,14 +55,14 @@ define void @convert_v7i16_v7f32(<7 x fl
;
; X64-SSE42-LABEL: convert_v7i16_v7f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: pxor %xmm1, %xmm1
-; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1
+; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
-; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
-; X64-SSE42-NEXT: extractps $2, %xmm0, 24(%rdi)
-; X64-SSE42-NEXT: movlps %xmm0, 16(%rdi)
-; X64-SSE42-NEXT: movups %xmm1, (%rdi)
+; X64-SSE42-NEXT: movups %xmm0, (%rdi)
+; X64-SSE42-NEXT: extractps $2, %xmm1, 24(%rdi)
+; X64-SSE42-NEXT: movlps %xmm1, 16(%rdi)
; X64-SSE42-NEXT: retq
entry:
%val = uitofp <7 x i16> %src to <7 x float>
@@ -75,66 +75,58 @@ entry:
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
; X86-SSE2: # %bb.0: # %entry
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movzwl (%ecx), %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
-; X86-SSE2-NEXT: movzbl (%esp), %edx
-; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: pinsrw $2, %edx, %xmm0
-; X86-SSE2-NEXT: pinsrw $4, %ecx, %xmm0
-; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movd %ecx, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: pandn %xmm2, %xmm1
+; X86-SSE2-NEXT: por %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
; X86-SSE2-NEXT: movaps %xmm0, %xmm1
; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
; X86-SSE42: # %bb.0: # %entry
-; X86-SSE42-NEXT: pushl %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
-; X86-SSE42-NEXT: movzwl (%ecx), %ecx
-; X86-SSE42-NEXT: movd %ecx, %xmm0
+; X86-SSE42-NEXT: movzwl (%ecx), %edx
+; X86-SSE42-NEXT: movd %edx, %xmm0
+; X86-SSE42-NEXT: pinsrb $2, 2(%ecx), %xmm0
; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
-; X86-SSE42-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
; X86-SSE42-NEXT: movss %xmm0, (%eax)
-; X86-SSE42-NEXT: popl %eax
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movd %eax, %xmm0
-; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT: movd %ecx, %xmm0
-; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSE2-NEXT: pinsrw $4, %eax, %xmm0
-; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movd %eax, %xmm2
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: pandn %xmm2, %xmm1
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm0
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
@@ -142,12 +134,10 @@ define void @convert_v3i8_to_v3f32(<3 x
;
; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movd %ecx, %xmm0
+; X64-SSE42-NEXT: movzwl (%rsi), %eax
+; X64-SSE42-NEXT: movd %eax, %xmm0
+; X64-SSE42-NEXT: pinsrb $2, 2(%rsi), %xmm0
; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
-; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
Modified: llvm/trunk/test/CodeGen/X86/widen_load-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_load-2.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll Wed Aug 7 09:24:26 2019
@@ -15,7 +15,8 @@ define void @add3i32(%i32vec3* sret %re
; X86-NEXT: movdqa (%edx), %xmm0
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
+; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32:
@@ -39,13 +40,16 @@ define void @add3i32_2(%i32vec3* sret %
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrd $1, 4(%edx), %xmm0
; X86-NEXT: pinsrd $2, 8(%edx), %xmm0
-; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1
; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1
; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: pextrd $1, %xmm1, 4(%eax)
; X86-NEXT: pextrd $2, %xmm1, 8(%eax)
+; X86-NEXT: movd %xmm1, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32_2:
@@ -77,8 +81,9 @@ define void @add7i32(%i32vec7* sret %re
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: paddd 16(%ecx), %xmm1
+; X86-NEXT: movd %xmm1, 16(%eax)
+; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
-; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
@@ -143,31 +148,25 @@ define void @add12i32(%i32vec12* sret %
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; X86-LABEL: add3i16:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl 16(%ebp), %ecx
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pextrw $4, %xmm1, 4(%eax)
-; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrw $2, 4(%edx), %xmm0
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrw $2, 4(%ecx), %xmm1
+; X86-NEXT: paddw %xmm0, %xmm1
+; X86-NEXT: pextrw $2, %xmm1, 4(%eax)
; X86-NEXT: movd %xmm1, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
; X64-LABEL: add3i16:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: pextrw $4, %xmm1, 4(%rdi)
-; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: paddw %xmm0, %xmm1
+; X64-NEXT: pextrw $2, %xmm1, 4(%rdi)
; X64-NEXT: movd %xmm1, (%rdi)
; X64-NEXT: retq
%a = load %i16vec3, %i16vec3* %ap, align 16
@@ -216,7 +215,8 @@ define void @add12i16(%i16vec12* nocaptu
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddw (%ecx), %xmm0
; X86-NEXT: paddw 16(%ecx), %xmm1
-; X86-NEXT: movq %xmm1, 16(%eax)
+; X86-NEXT: movd %xmm1, 16(%eax)
+; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
@@ -280,27 +280,23 @@ define void @add18i16(%i16vec18* nocaptu
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; X86-LABEL: add3i8:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pextrb $8, %xmm1, 2(%eax)
-; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: paddb %xmm0, %xmm1
+; X86-NEXT: pextrb $2, %xmm1, 2(%eax)
; X86-NEXT: pextrw $0, %xmm1, (%eax)
-; X86-NEXT: addl $12, %esp
; X86-NEXT: retl $4
;
; X64-LABEL: add3i8:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: pextrb $8, %xmm1, 2(%rdi)
-; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: paddb %xmm0, %xmm1
+; X64-NEXT: pextrb $2, %xmm1, 2(%rdi)
; X64-NEXT: pextrw $0, %xmm1, (%rdi)
; X64-NEXT: retq
%a = load %i8vec3, %i8vec3* %ap, align 16
@@ -321,10 +317,11 @@ define void @add31i8(%i8vec31* nocapture
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddb (%ecx), %xmm0
; X86-NEXT: paddb 16(%ecx), %xmm1
+; X86-NEXT: movd %xmm1, 16(%eax)
+; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
; X86-NEXT: pextrw $6, %xmm1, 28(%eax)
; X86-NEXT: pextrb $14, %xmm1, 30(%eax)
-; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
@@ -353,7 +350,6 @@ define void @add31i8(%i8vec31* nocapture
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; X86-LABEL: rot:
; X86: # %bb.0: # %entry
-; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -361,12 +357,11 @@ define void @rot(%i8vec3pack* nocapture
; X86-NEXT: movw $-24930, (%edx) # imm = 0x9E9E
; X86-NEXT: movb $1, 2(%ecx)
; X86-NEXT: movw $257, (%ecx) # imm = 0x101
-; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: psrld $1, %xmm0
-; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
-; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: psrlw $1, %xmm0
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pextrb $2, %xmm0, 2(%eax)
; X86-NEXT: pextrw $0, %xmm0, (%eax)
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl $4
;
; X64-LABEL: rot:
@@ -376,10 +371,10 @@ define void @rot(%i8vec3pack* nocapture
; X64-NEXT: movw $-24930, (%rsi) # imm = 0x9E9E
; X64-NEXT: movb $1, 2(%rdx)
; X64-NEXT: movw $257, (%rdx) # imm = 0x101
-; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: psrld $1, %xmm0
-; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: psrlw $1, %xmm0
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-NEXT: pextrb $2, %xmm0, 2(%rdi)
; X64-NEXT: pextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll Wed Aug 7 09:24:26 2019
@@ -89,18 +89,12 @@ entry:
define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; X86-LABEL: shuf4:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X86-NEXT: pshufb %xmm2, %xmm1
-; X86-NEXT: pshufb %xmm2, %xmm0
-; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: shuf4:
; X64: # %bb.0:
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-NEXT: pshufb %xmm2, %xmm1
-; X64-NEXT: pshufb %xmm2, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
%vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %vshuf
Modified: llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll (original)
+++ llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll Wed Aug 7 09:24:26 2019
@@ -389,29 +389,30 @@ ret void
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX-LABEL: interleaved_load_vf8_i8_stride4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm3[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
-; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT: vpaddw %xmm0, %xmm4, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
%v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -888,13 +889,8 @@ define <32 x i1> @interleaved_load_vf32_
define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
; AVX-LABEL: interleaved_store_vf8_i8_stride4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
-; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
@@ -1017,17 +1013,17 @@ define <8 x i8> @interleaved_load_vf8_i8
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
%wide.vec = load <24 x i8>, <24 x i8>* %ptr
%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
@@ -1041,19 +1037,15 @@ define <8 x i8> @interleaved_load_vf8_i8
define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
; AVX-LABEL: interleaved_store_vf8_i8_stride3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm2, (%rdi)
+; AVX-NEXT: vmovdqu %xmm1, (%rdi)
; AVX-NEXT: retq
%1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Modified: llvm/trunk/test/CodeGen/X86/x86-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-shifts.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/x86-shifts.ll Wed Aug 7 09:24:26 2019
@@ -254,16 +254,16 @@ define <2 x i32> @shl2_other(<2 x i32> %
; X32-LABEL: shl2_other:
; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
-; X32-NEXT: psllq $2, %xmm1
-; X32-NEXT: psllq $9, %xmm0
+; X32-NEXT: pslld $2, %xmm1
+; X32-NEXT: pslld $9, %xmm0
; X32-NEXT: pxor %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: shl2_other:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psllq $2, %xmm1
-; X64-NEXT: psllq $9, %xmm0
+; X64-NEXT: pslld $2, %xmm1
+; X64-NEXT: pslld $9, %xmm0
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: retq
entry:
@@ -276,19 +276,17 @@ entry:
define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
; X32-LABEL: shr2_other:
; X32: # %bb.0: # %entry
-; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-NEXT: movdqa %xmm0, %xmm1
-; X32-NEXT: psrlq $8, %xmm1
-; X32-NEXT: psrlq $1, %xmm0
+; X32-NEXT: psrld $8, %xmm1
+; X32-NEXT: psrld $1, %xmm0
; X32-NEXT: pxor %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: shr2_other:
; X64: # %bb.0: # %entry
-; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psrlq $8, %xmm1
-; X64-NEXT: psrlq $1, %xmm0
+; X64-NEXT: psrld $8, %xmm1
+; X64-NEXT: psrld $1, %xmm0
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: retq
entry:
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll Wed Aug 7 09:24:26 2019
@@ -22,9 +22,19 @@ define <2 x i8> @g(<2 x i8> %x, <2 x i8>
define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0
+; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
+; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT: ret <4 x i8> [[INS4]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
@@ -43,9 +53,16 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8>
define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h_undef(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1
+; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT: ret <4 x i8> [[INS4]]
;
%x0 = extractelement <4 x i8> undef, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
@@ -64,13 +81,17 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4
define i8 @i(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @i(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP2]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i8> [[BIN_RDX]], <4 x i8> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/fptosi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/fptosi.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/fptosi.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/fptosi.ll Wed Aug 7 09:24:26 2019
@@ -160,38 +160,11 @@ define void @fptosi_8f64_8i32() #0 {
}
define void @fptosi_8f64_8i16() #0 {
-; SSE-LABEL: @fptosi_8f64_8i16(
-; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16
-; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16
-; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16
-; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16
-; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16
-; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16
-; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16
-; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16
-; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
-; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
-; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
-; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
-; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
-; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
-; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
-; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @fptosi_8f64_8i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16>
-; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX-NEXT: ret void
+; CHECK-LABEL: @fptosi_8f64_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/fptoui.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/fptoui.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/fptoui.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/fptoui.ll Wed Aug 7 09:24:26 2019
@@ -238,44 +238,11 @@ define void @fptoui_8f64_8i16() #0 {
; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
; SSE-NEXT: ret void
;
-; AVX256NODQ-LABEL: @fptoui_8f64_8i16(
-; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16
-; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16
-; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16
-; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16
-; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16
-; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
-; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @fptoui_8f64_8i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
-; AVX512-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @fptoui_8f64_8i16(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
-; AVX256DQ-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX256DQ-NEXT: ret void
+; AVX-LABEL: @fptoui_8f64_8i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
+; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll Wed Aug 7 09:24:26 2019
@@ -344,16 +344,22 @@ define <4 x float> @simple_select_no_use
; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
-; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
-; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
-; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2
-; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
+; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2
+; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3
; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
;
%c0 = extractelement <4 x i32> %c, i32 0
@@ -430,18 +436,12 @@ define <2 x float> @simple_select_v2(<2
; CHECK-NEXT: ret <2 x float> [[RB]]
;
; ZEROTHRESH-LABEL: @simple_select_v2(
-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0
-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1
-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1
-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0
-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1
-; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0
-; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
; ZEROTHRESH-NEXT: ret <2 x float> [[RB]]
;
%c0 = extractelement <2 x i32> %c, i32 0
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll Wed Aug 7 09:24:26 2019
@@ -916,11 +916,26 @@ define void @sitofp_16i32_16f32() #0 {
}
define void @sitofp_4i16_4f32() #0 {
-; CHECK-LABEL: @sitofp_4i16_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; CHECK-NEXT: ret void
+; SSE-LABEL: @sitofp_4i16_4f32(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_4i16_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -939,12 +954,30 @@ define void @sitofp_4i16_4f32() #0 {
define void @sitofp_8i16_8f32() #0 {
; SSE-LABEL: @sitofp_8i16_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @sitofp_8i16_8f32(
@@ -982,18 +1015,54 @@ define void @sitofp_8i16_8f32() #0 {
define void @sitofp_16i16_16f32() #0 {
; SSE-LABEL: @sitofp_16i16_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
+; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
+; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
+; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
+; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
+; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
+; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
+; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
+; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
+; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
+; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
+; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sitofp_16i16_16f32(
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll?rev=368183&r1=368182&r2=368183&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/uitofp.ll Wed Aug 7 09:24:26 2019
@@ -868,11 +868,26 @@ define void @uitofp_16i32_16f32() #0 {
}
define void @uitofp_4i16_4f32() #0 {
-; CHECK-LABEL: @uitofp_4i16_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; CHECK-NEXT: ret void
+; SSE-LABEL: @uitofp_4i16_4f32(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_4i16_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -891,12 +906,30 @@ define void @uitofp_4i16_4f32() #0 {
define void @uitofp_8i16_8f32() #0 {
; SSE-LABEL: @uitofp_8i16_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i16_8f32(
@@ -934,18 +967,54 @@ define void @uitofp_8i16_8f32() #0 {
define void @uitofp_16i16_16f32() #0 {
; SSE-LABEL: @uitofp_16i16_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
+; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
+; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
+; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[LD8]] to float
+; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[LD9]] to float
+; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[LD10]] to float
+; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[LD11]] to float
+; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[LD12]] to float
+; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[LD13]] to float
+; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[LD14]] to float
+; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[LD15]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
+; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i16_16f32(
More information about the llvm-commits
mailing list