[llvm] r347242 - [X86][SSE] Remove unnecessary bit-and in pshufb vector ctlz (PR39703)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 19 10:40:59 PST 2018


Author: rksimon
Date: Mon Nov 19 10:40:59 2018
New Revision: 347242

URL: http://llvm.org/viewvc/llvm-project?rev=347242&view=rev
Log:
[X86][SSE] Remove unnecessary bit-and in pshufb vector ctlz (PR39703)

SSE PSHUFB vector ctlz lowering works at the i4 nibble level. As detailed in PR39703, we were masking the value to isolate the lower nibble, but that lookup result is only used when the upper nibble is known to be zero, so the mask is unnecessary and removing it saves an instruction.

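For reference, a minimal scalar sketch of the per-byte lookup scheme (illustrative only; the standalone helper and its names are not part of the lowering code):

  #include <cstdint>

  // ctlz of a 4-bit value -- the same table the PSHUFB lowering uses.
  static const uint8_t LUT[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};

  uint8_t ctlz8(uint8_t b) {
    uint8_t hi = b >> 4;
    // PSHUFB only reads the low 4 bits of each index byte (and zeroes the
    // result if bit 7 is set), so the vector lookup works without first
    // ANDing the input with 0xF; the scalar model keeps the mask only to
    // form a valid array index.
    uint8_t lo_cnt = LUT[b & 0xF];
    uint8_t hi_cnt = LUT[hi];   // equals 4 when the upper nibble is zero
    // The lower-nibble count only contributes when hi == 0, which is exactly
    // the case where the unmasked lookup index is already correct.
    return (hi == 0) ? hi_cnt + lo_cnt : hi_cnt;
  }
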
Differential Revision: https://reviews.llvm.org/D54707

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/combine-srl.ll
    llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=347242&r1=347241&r2=347242&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Nov 19 10:40:59 2018
@@ -23103,9 +23103,8 @@ static SDValue LowerVectorCTLZInRegLUT(S
   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
 
-  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
-  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+  SDValue Lo = Op0;
   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
   SDValue HiZ;
   if (CurrVT.is512BitVector()) {

Modified: llvm/trunk/test/CodeGen/X86/combine-srl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-srl.ll?rev=347242&r1=347241&r2=347242&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-srl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-srl.ll Mon Nov 19 10:40:59 2018
@@ -347,50 +347,48 @@ define <4 x i32> @combine_vec_lshr_lzcnt
 ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    pshufb %xmm0, %xmm3
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    psrlw $4, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    movdqa %xmm3, %xmm4
-; SSE-NEXT:    pshufb %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE-NEXT:    pand %xmm0, %xmm5
-; SSE-NEXT:    pshufb %xmm5, %xmm4
-; SSE-NEXT:    pand %xmm1, %xmm4
-; SSE-NEXT:    paddb %xmm4, %xmm3
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pshufb %xmm1, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm1
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm3
-; SSE-NEXT:    paddw %xmm1, %xmm3
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE-NEXT:    paddb %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE-NEXT:    psrlw $8, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    psrld $16, %xmm3
-; SSE-NEXT:    paddd %xmm3, %xmm0
-; SSE-NEXT:    psrld $5, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    psrld $5, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
-; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
-; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm3
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
+; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1

Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll?rev=347242&r1=347241&r2=347242&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-lzcnt.ll Mon Nov 19 10:40:59 2018
@@ -38,17 +38,15 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 define <16 x i8> @testv16i8(<16 x i8> %in) {
 ; AVX256-LABEL: testv16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX256-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX256-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX256-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX256-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX256-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX256-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX256-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX256-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX256-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX256-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX256-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX256-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX256-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX256-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX256-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX256-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8:
@@ -93,17 +91,15 @@ define <16 x i16> @testv16i16(<16 x i16>
 define <32 x i8> @testv32i8(<32 x i8> %in) {
 ; AVX256-LABEL: testv32i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX256-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX256-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX256-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX256-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX256-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX256-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX256-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX256-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX256-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX256-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX256-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX256-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX256-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX256-NEXT:    retq
 ;
 ; AVX512-LABEL: testv32i8:

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll?rev=347242&r1=347241&r2=347242&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll Mon Nov 19 10:40:59 2018
@@ -96,33 +96,30 @@ define <2 x i64> @testv2i64(<2 x i64> %i
 ;
 ; SSSE3-LABEL: testv2i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSSE3-NEXT:    psrld $16, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSSE3-NEXT:    psrld $16, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrld $16, %xmm1
-; SSSE3-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    paddd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm1
@@ -132,33 +129,30 @@ define <2 x i64> @testv2i64(<2 x i64> %i
 ;
 ; SSE41-LABEL: testv2i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSE41-NEXT:    psrld $16, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm1
@@ -168,16 +162,14 @@ define <2 x i64> @testv2i64(<2 x i64> %i
 ;
 ; AVX-LABEL: testv2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -198,16 +190,14 @@ define <2 x i64> @testv2i64(<2 x i64> %i
 ;
 ; AVX512VLBWDQ-LABEL: testv2i64:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -241,33 +231,30 @@ define <2 x i64> @testv2i64(<2 x i64> %i
 ;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
-; X32-SSE-NEXT:    psrld $16, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
+; X32-SSE-NEXT:    psrld $16, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    paddd %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; X32-SSE-NEXT:    paddd %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm1
@@ -362,33 +349,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %
 ;
 ; SSSE3-LABEL: testv2i64u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSSE3-NEXT:    psrld $16, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSSE3-NEXT:    psrld $16, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrld $16, %xmm1
-; SSSE3-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    paddd %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlq $32, %xmm1
@@ -398,33 +382,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %
 ;
 ; SSE41-LABEL: testv2i64u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
-; SSE41-NEXT:    psrld $16, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm1
@@ -434,16 +415,14 @@ define <2 x i64> @testv2i64u(<2 x i64> %
 ;
 ; AVX-LABEL: testv2i64u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -464,16 +443,14 @@ define <2 x i64> @testv2i64u(<2 x i64> %
 ;
 ; AVX512VLBWDQ-LABEL: testv2i64u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -507,33 +484,30 @@ define <2 x i64> @testv2i64u(<2 x i64> %
 ;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
-; X32-SSE-NEXT:    psrld $16, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
+; X32-SSE-NEXT:    psrld $16, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    paddd %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; X32-SSE-NEXT:    paddd %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlq $32, %xmm1
@@ -632,27 +606,24 @@ define <4 x i32> @testv4i32(<4 x i32> %i
 ;
 ; SSSE3-LABEL: testv4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm1
@@ -662,27 +633,24 @@ define <4 x i32> @testv4i32(<4 x i32> %i
 ;
 ; SSE41-LABEL: testv4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm1
@@ -692,16 +660,14 @@ define <4 x i32> @testv4i32(<4 x i32> %i
 ;
 ; AVX-LABEL: testv4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -717,16 +683,14 @@ define <4 x i32> @testv4i32(<4 x i32> %i
 ;
 ; AVX512VLBWDQ-LABEL: testv4i32:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -755,27 +719,24 @@ define <4 x i32> @testv4i32(<4 x i32> %i
 ;
 ; X32-SSE-LABEL: testv4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm1
@@ -874,27 +835,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %
 ;
 ; SSSE3-LABEL: testv4i32u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSSE3-NEXT:    psrlw $8, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSSE3-NEXT:    psrlw $8, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
 ; SSSE3-NEXT:    psrlw $8, %xmm1
-; SSSE3-NEXT:    paddw %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSSE3-NEXT:    paddw %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrld $16, %xmm1
@@ -904,27 +862,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %
 ;
 ; SSE41-LABEL: testv4i32u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE41-NEXT:    psrlw $8, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $16, %xmm1
@@ -934,16 +889,14 @@ define <4 x i32> @testv4i32u(<4 x i32> %
 ;
 ; AVX-LABEL: testv4i32u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -959,16 +912,14 @@ define <4 x i32> @testv4i32u(<4 x i32> %
 ;
 ; AVX512VLBWDQ-LABEL: testv4i32u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
@@ -997,27 +948,24 @@ define <4 x i32> @testv4i32u(<4 x i32> %
 ;
 ; X32-SSE-LABEL: testv4i32u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
-; X32-SSE-NEXT:    paddw %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X32-SSE-NEXT:    paddw %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrld $16, %xmm1
@@ -1104,21 +1052,18 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm1
@@ -1128,21 +1073,18 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; SSE41-LABEL: testv8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm1
@@ -1152,16 +1094,14 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; AVX-LABEL: testv8i16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1172,16 +1112,14 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; AVX512VLBWDQ-LABEL: testv8i16:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1210,21 +1148,18 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; X32-SSE-LABEL: testv8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
@@ -1310,21 +1245,18 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; SSSE3-LABEL: testv8i16u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrlw $4, %xmm1
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    paddb %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm0
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    psrlw $8, %xmm1
@@ -1334,21 +1266,18 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; SSE41-LABEL: testv8i16u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm1, %xmm3
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm1
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    pand %xmm1, %xmm0
 ; SSE41-NEXT:    psrlw $8, %xmm1
@@ -1358,16 +1287,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; AVX-LABEL: testv8i16u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1378,16 +1305,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; AVX512VLBWDQ-LABEL: testv8i16u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
 ; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -1416,21 +1341,18 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; X32-SSE-LABEL: testv8i16u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm1
-; X32-SSE-NEXT:    paddb %xmm3, %xmm1
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm0
 ; X32-SSE-NEXT:    pand %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $8, %xmm1
@@ -1510,68 +1432,58 @@ define <16 x i8> @testv16i8(<16 x i8> %i
 ;
 ; SSSE3-LABEL: testv16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm0, %xmm2
 ; SSSE3-NEXT:    psrlw $4, %xmm0
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
-; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    paddb %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    pshufb %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    pshufb %xmm0, %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv16i8:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8:
@@ -1585,19 +1497,16 @@ define <16 x i8> @testv16i8(<16 x i8> %i
 ;
 ; X32-SSE-LABEL: testv16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm2
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
@@ -1673,68 +1582,58 @@ define <16 x i8> @testv16i8u(<16 x i8> %
 ;
 ; SSSE3-LABEL: testv16i8u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm0, %xmm2
 ; SSSE3-NEXT:    psrlw $4, %xmm0
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
-; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    paddb %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    pshufb %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    pshufb %xmm0, %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv16i8u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512VLBWDQ-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX512VLBWDQ-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8u:
@@ -1748,19 +1647,16 @@ define <16 x i8> @testv16i8u(<16 x i8> %
 ;
 ; X32-SSE-LABEL: testv16i8u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm2
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
 ; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
-; X32-SSE-NEXT:    paddb %xmm2, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
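
For readers following the CHECK lines above: below is a minimal scalar sketch of the per-byte nibble-LUT ctlz that these pshufb sequences compute. It is only an illustrative model under the stated assumptions, not code from the patch; the function and table names (ctlz8, NibbleCLZ) are invented for the example.

#include <cassert>
#include <cstdint>

// Leading-zero count of a 4-bit value, indexed by the nibble itself.
// This mirrors the [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] constant in the tests.
static const uint8_t NibbleCLZ[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                      0, 0, 0, 0, 0, 0, 0, 0};

// Scalar model of the per-byte ctlz that the PSHUFB sequence computes:
// look up both nibbles, and only add the low-nibble count when the high
// nibble is zero (in which case the high-nibble count is already 4).
static uint8_t ctlz8(uint8_t B) {
  uint8_t Hi = B >> 4;                 // vpsrlw $4 plus 0xF mask in the tests
  uint8_t HiCLZ = NibbleCLZ[Hi];       // high-nibble lookup
  uint8_t LoCLZ = NibbleCLZ[B & 0xF];  // low-nibble lookup
  return (Hi == 0) ? uint8_t(HiCLZ + LoCLZ) : HiCLZ;
}

int main() {
  assert(ctlz8(0x00) == 8);
  assert(ctlz8(0x01) == 7);
  assert(ctlz8(0x10) == 3);
  assert(ctlz8(0x80) == 0);
  return 0;
}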

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll?rev=347242&r1=347241&r2=347242&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll Mon Nov 19 10:40:59 2018
@@ -13,40 +13,38 @@ define <4 x i64> @testv4i64(<4 x i64> %i
 ; AVX1-LABEL: testv4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm6, %xmm7
-; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddb %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddw %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrld $16, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrld $16, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
-; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm3, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
@@ -67,16 +65,14 @@ define <4 x i64> @testv4i64(<4 x i64> %i
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -97,16 +93,14 @@ define <4 x i64> @testv4i64(<4 x i64> %i
 ;
 ; AVX512VL-LABEL: testv4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -127,16 +121,14 @@ define <4 x i64> @testv4i64(<4 x i64> %i
 ;
 ; AVX512VLBWDQ-LABEL: testv4i64:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -169,16 +161,14 @@ define <4 x i64> @testv4i64(<4 x i64> %i
 ;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -205,40 +195,38 @@ define <4 x i64> @testv4i64u(<4 x i64> %
 ; AVX1-LABEL: testv4i64u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm6, %xmm7
-; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
-; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
+; AVX1-NEXT:    vpaddb %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddw %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm6
 ; AVX1-NEXT:    vpsrld $16, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm6
+; AVX1-NEXT:    vpsrld $16, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
-; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm3, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
@@ -259,16 +247,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %
 ;
 ; AVX2-LABEL: testv4i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -289,16 +275,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %
 ;
 ; AVX512VL-LABEL: testv4i64u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -319,16 +303,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %
 ;
 ; AVX512VLBWDQ-LABEL: testv4i64u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -361,16 +343,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %
 ;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -397,34 +377,32 @@ define <8 x i32> @testv8i32(<8 x i32> %i
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
@@ -441,16 +419,14 @@ define <8 x i32> @testv8i32(<8 x i32> %i
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -466,16 +442,14 @@ define <8 x i32> @testv8i32(<8 x i32> %i
 ;
 ; AVX512VL-LABEL: testv8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -491,16 +465,14 @@ define <8 x i32> @testv8i32(<8 x i32> %i
 ;
 ; AVX512VLBWDQ-LABEL: testv8i32:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -528,16 +500,14 @@ define <8 x i32> @testv8i32(<8 x i32> %i
 ;
 ; X32-AVX-LABEL: testv8i32:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -559,34 +529,32 @@ define <8 x i32> @testv8i32u(<8 x i32> %
 ; AVX1-LABEL: testv8i32u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
-; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
@@ -603,16 +571,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %
 ;
 ; AVX2-LABEL: testv8i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -628,16 +594,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %
 ;
 ; AVX512VL-LABEL: testv8i32u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -653,16 +617,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %
 ;
 ; AVX512VLBWDQ-LABEL: testv8i32u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -690,16 +652,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %
 ;
 ; X32-AVX-LABEL: testv8i32u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
@@ -721,29 +681,27 @@ define <16 x i16> @testv16i16(<16 x i16>
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -755,16 +713,14 @@ define <16 x i16> @testv16i16(<16 x i16>
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -775,16 +731,14 @@ define <16 x i16> @testv16i16(<16 x i16>
 ;
 ; AVX512VL-LABEL: testv16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -795,16 +749,14 @@ define <16 x i16> @testv16i16(<16 x i16>
 ;
 ; AVX512VLBWDQ-LABEL: testv16i16:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -823,16 +775,14 @@ define <16 x i16> @testv16i16(<16 x i16>
 ;
 ; X32-AVX-LABEL: testv16i16:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -848,29 +798,27 @@ define <16 x i16> @testv16i16u(<16 x i16
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
-; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm4, %xmm5
 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
@@ -882,16 +830,14 @@ define <16 x i16> @testv16i16u(<16 x i16
 ;
 ; AVX2-LABEL: testv16i16u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -902,16 +848,14 @@ define <16 x i16> @testv16i16u(<16 x i16
 ;
 ; AVX512VL-LABEL: testv16i16u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -922,16 +866,14 @@ define <16 x i16> @testv16i16u(<16 x i16
 ;
 ; AVX512VLBWDQ-LABEL: testv16i16u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -950,16 +892,14 @@ define <16 x i16> @testv16i16u(<16 x i16
 ;
 ; X32-AVX-LABEL: testv16i16u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm3, %ymm3
 ; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm5
 ; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; X32-AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -975,71 +915,63 @@ define <32 x i8> @testv32i8(<32 x i8> %i
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: testv32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv32i8:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv32i8:
@@ -1059,17 +991,15 @@ define <32 x i8> @testv32i8(<32 x i8> %i
 ;
 ; X32-AVX-LABEL: testv32i8:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; X32-AVX-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
   ret <32 x i8> %out
@@ -1079,71 +1009,63 @@ define <32 x i8> @testv32i8u(<32 x i8> %
 ; AVX1-LABEL: testv32i8u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: testv32i8u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: testv32i8u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; AVX512VLBWDQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; AVX512-LABEL: testv32i8u:
@@ -1163,17 +1085,15 @@ define <32 x i8> @testv32i8u(<32 x i8> %
 ;
 ; X32-AVX-LABEL: testv32i8u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; X32-AVX-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
+; X32-AVX-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1)
   ret <32 x i8> %out

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll?rev=347242&r1=347241&r2=347242&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll Mon Nov 19 10:40:59 2018
@@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16>
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm4
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm3, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
 ; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
@@ -380,29 +379,27 @@ define <32 x i16> @testv32i16(<32 x i16>
 ;
 ; AVX512DQ-LABEL: testv32i16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm5, %ymm7
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm7
 ; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vpaddb %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpand %ymm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddw %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm5
 ; AVX512DQ-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
@@ -445,16 +442,15 @@ define <32 x i16> @testv32i16u(<32 x i16
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm4
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm4, %zmm3, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
 ; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
@@ -466,29 +462,27 @@ define <32 x i16> @testv32i16u(<32 x i16
 ;
 ; AVX512DQ-LABEL: testv32i16u:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm5, %ymm7
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm7
 ; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vpaddb %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpand %ymm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddw %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm5
 ; AVX512DQ-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
@@ -555,40 +549,37 @@ define <64 x i8> @testv64i8(<64 x i8> %i
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm6
 ; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
   ret <64 x i8> %out
@@ -649,40 +640,37 @@ define <64 x i8> @testv64i8u(<64 x i8> %
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8u:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm6
 ; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vpand %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
   ret <64 x i8> %out
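
As a reference for readers of the test diffs above, here is a minimal scalar
sketch (not part of this patch; the helper name and the __builtin_clz check
are purely illustrative) of what the PSHUFB nibble-LUT lowering computes per
byte, using the same [4,3,2,2,1,1,1,1,0,...] table that appears in the CHECK
lines. It also shows why the low-nibble AND is redundant: the low-nibble
lookup only contributes when the upper nibble is zero, in which case the byte
is already a valid 4-bit index, while PSHUFB itself zeroes a lane whenever
bit 7 of the index byte is set, which can only happen when the upper nibble
is non-zero and that lane's result is discarded anyway.

// Scalar model of the per-byte nibble-LUT ctlz used by the vector lowering.
#include <cassert>
#include <cstdint>

static const uint8_t CtlzNibbleLUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                          0, 0, 0, 0, 0, 0, 0, 0};

uint8_t ctlz8_nibble_lut(uint8_t Byte) {
  uint8_t Hi = Byte >> 4;
  // The C array index still needs masking for bounds safety; the vector code
  // no longer needs the AND, for the reasons described above.
  uint8_t LoCount = CtlzNibbleLUT[Byte & 0xF];
  uint8_t HiCount = CtlzNibbleLUT[Hi];
  // If the upper nibble is zero, HiCount is 4 and the low-nibble count is
  // added on top; otherwise only the upper-nibble count is used.
  return Hi == 0 ? uint8_t(HiCount + LoCount) : HiCount;
}

int main() {
  assert(ctlz8_nibble_lut(0) == 8); // ctlz of 0 is defined as 8 here
  for (unsigned V = 1; V < 256; ++V)
    assert(ctlz8_nibble_lut(uint8_t(V)) == unsigned(__builtin_clz(V)) - 24);
  return 0;
}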



