[llvm] r269646 - [X86][SSSE3] Lower vector CTLZ with PSHUFB lookups

Steven Wu via llvm-commits llvm-commits at lists.llvm.org
Mon May 16 09:03:17 PDT 2016


> On May 16, 2016, at 4:19 AM, Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org> wrote:
> 
> Author: rksimon
> Date: Mon May 16 06:19:11 2016
> New Revision: 269646
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=269646&view=rev
> Log:
> [X86][SSSE3] Lower vector CTLZ with PSHUFB lookups
> 
> This patch uses PSHUFB to lower vector CTLZ and avoid (slower) scalarizations.
> 
> The leading zero count of each 4-bit nibble of the vector is determined by using a PSHUFB lookup. Pairs of results are then repeatedly combined up to the original element width.
> 
> Differential Revision: http://reviews.llvm.org/D20016
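
For readers who want the idea without wading through the DAG nodes below, here is a scalar C++ sketch of the same per-nibble lookup-and-merge scheme. It is illustrative only: NibbleLZ, ctlz8_lut and ctlz16_lut are invented names for this sketch and are not part of the patch.

  #include <cstdint>

  // Leading-zero count of every possible 4-bit nibble value, mirroring the
  // 16-entry table the patch feeds to PSHUFB.
  static const uint8_t NibbleLZ[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                       0, 0, 0, 0, 0, 0, 0, 0};

  // i8 CTLZ: if the high nibble is non-zero its count is the answer;
  // otherwise the high nibble contributes 4 leading zeros plus the low
  // nibble's count.
  uint8_t ctlz8_lut(uint8_t X) {
    uint8_t Hi = X >> 4;
    uint8_t Lo = X & 0xF;
    return Hi ? NibbleLZ[Hi] : uint8_t(4 + NibbleLZ[Lo]);
  }

  // i16 CTLZ: combine a pair of i8 results the same way, doubling the
  // element width; this is what the widening loop in the patch does with
  // PCMPEQB/PSRLW/PADDW.
  uint16_t ctlz16_lut(uint16_t X) {
    uint8_t Hi = uint8_t(X >> 8);
    uint8_t Lo = uint8_t(X & 0xFF);
    return Hi ? ctlz8_lut(Hi) : uint16_t(8 + ctlz8_lut(Lo));
  }
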
> 
> Modified:
>    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>    llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
>    llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=269646&r1=269645&r2=269646&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon May 16 06:19:11 2016
> @@ -864,6 +864,13 @@ X86TargetLowering::X86TargetLowering(con
>     }
>   }
> 
> +  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
> +    setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
> +    setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
> +    // ISD::CTLZ v4i32 - scalarization is faster.
> +    // ISD::CTLZ v2i64 - scalarization is faster.
> +  }
> +
>   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
>     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
>       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
> @@ -932,6 +939,8 @@ X86TargetLowering::X86TargetLowering(con
>   }
> 
>   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
> +    bool HasInt256 = Subtarget.hasInt256();
> +
>     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
>     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
>     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
> @@ -998,14 +1007,21 @@ X86TargetLowering::X86TargetLowering(con
>       setOperationAction(ISD::CTTZ,            VT, Custom);
>     }
> 
> +    // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
> +    // as we end up splitting the 256-bit vectors.
> +    for (auto VT : { MVT::v32i8, MVT::v16i16 })
> +      setOperationAction(ISD::CTLZ,            VT, Custom);
> +
> +    if (HasInt256)
> +      for (auto VT : { MVT::v8i32, MVT::v4i64 })
> +        setOperationAction(ISD::CTLZ,          VT, Custom);
> +
>     if (Subtarget.hasAnyFMA()) {
>       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
>                        MVT::v2f64, MVT::v4f64 })
>         setOperationAction(ISD::FMA, VT, Legal);
>     }
> 
> -    bool HasInt256 = Subtarget.hasInt256();
> -
>     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
>       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
>       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
> @@ -18767,7 +18783,105 @@ static SDValue LowerVectorCTLZ_AVX512(SD
>   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
> }
> 
> -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
> +// Lower CTLZ using a PSHUFB lookup table implementation.
> +static SDValue LowerVectorCTLZInRegLUT(SDValue Op, SDLoc DL,
> +                                       const X86Subtarget &Subtarget,
> +                                       SelectionDAG &DAG) {
> +  MVT VT = Op.getSimpleValueType();
> +  MVT SVT = VT.getScalarType();

SVT is not used? This is causing a compiler warning. Can you fix that?

Thanks

Steven
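
For reference, the usual way to clear a -Wunused-variable warning like this is either to delete the dead local or to discard the value explicitly if the call is kept on purpose. A minimal sketch of both options (illustrative, not the actual follow-up commit):

  MVT VT = Op.getSimpleValueType();
  // Option 1: simply drop the dead local.
  //   MVT SVT = VT.getScalarType();   // removed
  // Option 2: keep the expression but mark the result as intentionally unused.
  (void)VT.getScalarType();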

> +  int NumElts = VT.getVectorNumElements();
> +  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
> +  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
> +
> +  // Per-nibble leading zero PSHUFB lookup table.
> +  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
> +                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
> +                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
> +                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
> +
> +  SmallVector<SDValue, 64> LUTVec;
> +  for (int i = 0; i < NumBytes; ++i)
> +    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
> +  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
> +
> +  // Begin by bitcasting the input to a byte vector, then split those bytes
> +  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
> +  // If the hi input nibble is zero then we add both results together, otherwise
> +  // we just take the hi result (by masking the lo result to zero before the
> +  // add).
> +  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
> +  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
> +
> +  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
> +  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
> +  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
> +  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
> +  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
> +
> +  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
> +  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
> +  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
> +  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
> +
> +  // Merge the result from vXi8 back to VT, working on the lo/hi halves
> +  // of the current vector width in the same way we did for the nibbles.
> +  // If the upper half of the input element is zero then add the halves'
> +  // leading zero counts together, otherwise just use the upper half's.
> +  // Double the width of the result until we are at target width.
> +  while (CurrVT != VT) {
> +    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
> +    int CurrNumElts = CurrVT.getVectorNumElements();
> +    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
> +    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
> +    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
> +
> +    // Check if the upper half of the input element is zero.
> +    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
> +                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
> +    HiZ = DAG.getBitcast(NextVT, HiZ);
> +
> +    // Move the upper/lower halves to the lower bits as we'll be extending to
> +    // NextVT. Mask the lower result to zero if HiZ is true and add the results
> +    // together.
> +    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
> +    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
> +    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
> +    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
> +    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
> +    CurrVT = NextVT;
> +  }
> +
> +  return Res;
> +}
> +
> +static SDValue LowerVectorCTLZ(SDValue Op, SDLoc DL,
> +                               const X86Subtarget &Subtarget,
> +                               SelectionDAG &DAG) {
> +  MVT VT = Op.getSimpleValueType();
> +  SDValue Op0 = Op.getOperand(0);
> +
> +  if (Subtarget.hasAVX512())
> +    return LowerVectorCTLZ_AVX512(Op, DAG);
> +
> +  // Decompose 256-bit ops into smaller 128-bit ops.
> +  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
> +    unsigned NumElems = VT.getVectorNumElements();
> +
> +    // Extract each 128-bit vector, perform ctlz and concat the result.
> +    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
> +    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
> +
> +    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
> +                       DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
> +                       DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
> +  }
> +
> +  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
> +  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
> +}
> +
> +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
> +                         SelectionDAG &DAG) {
>   MVT VT = Op.getSimpleValueType();
>   MVT OpVT = VT;
>   unsigned NumBits = VT.getSizeInBits();
> @@ -18775,7 +18889,7 @@ static SDValue LowerCTLZ(SDValue Op, Sel
>   unsigned Opc = Op.getOpcode();
> 
>   if (VT.isVector())
> -    return LowerVectorCTLZ_AVX512(Op, DAG);
> +    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
> 
>   Op = Op.getOperand(0);
>   if (VT == MVT::i8) {
> @@ -21304,7 +21418,7 @@ SDValue X86TargetLowering::LowerOperatio
>   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
>   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
>   case ISD::CTLZ:
> -  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, DAG);
> +  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
>   case ISD::CTTZ:
>   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
>   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
> 
> Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll?rev=269646&r1=269645&r2=269646&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll Mon May 16 06:19:11 2016
> @@ -706,145 +706,70 @@ define <8 x i16> @testv8i16(<8 x i16> %i
> ;
> ; SSSE3-LABEL: testv8i16:
> ; SSSE3:       # BB#0:
> -; SSSE3-NEXT:    pextrw $7, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %cx
> -; SSSE3-NEXT:    movw $31, %ax
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm1
> -; SSSE3-NEXT:    pextrw $3, %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm2
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
> -; SSSE3-NEXT:    pextrw $5, %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm3
> -; SSSE3-NEXT:    pextrw $1, %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm1
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
> -; SSSE3-NEXT:    pextrw $6, %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm2
> -; SSSE3-NEXT:    pextrw $2, %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm3
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
> -; SSSE3-NEXT:    pextrw $4, %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm2
> -; SSSE3-NEXT:    movd %xmm0, %ecx
> -; SSSE3-NEXT:    bsrw %cx, %cx
> -; SSSE3-NEXT:    cmovew %ax, %cx
> -; SSSE3-NEXT:    xorl $15, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT:    movdqa %xmm0, %xmm1
> +; SSSE3-NEXT:    pand %xmm2, %xmm1
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT:    movdqa %xmm3, %xmm4
> +; SSSE3-NEXT:    pshufb %xmm1, %xmm4
> +; SSSE3-NEXT:    movdqa %xmm0, %xmm1
> +; SSSE3-NEXT:    psrlw $4, %xmm1
> +; SSSE3-NEXT:    pand %xmm2, %xmm1
> +; SSSE3-NEXT:    pxor %xmm2, %xmm2
> +; SSSE3-NEXT:    pshufb %xmm1, %xmm3
> +; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
> +; SSSE3-NEXT:    pand %xmm4, %xmm1
> +; SSSE3-NEXT:    paddb %xmm3, %xmm1
> +; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
> +; SSSE3-NEXT:    psrlw $8, %xmm0
> +; SSSE3-NEXT:    pand %xmm1, %xmm0
> +; SSSE3-NEXT:    psrlw $8, %xmm1
> +; SSSE3-NEXT:    paddw %xmm0, %xmm1
> +; SSSE3-NEXT:    movdqa %xmm1, %xmm0
> ; SSSE3-NEXT:    retq
> ;
> ; SSE41-LABEL: testv8i16:
> ; SSE41:       # BB#0:
> -; SSE41-NEXT:    pextrw $1, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %cx
> -; SSE41-NEXT:    movw $31, %ax
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    movd %xmm0, %edx
> -; SSE41-NEXT:    bsrw %dx, %dx
> -; SSE41-NEXT:    cmovew %ax, %dx
> -; SSE41-NEXT:    xorl $15, %edx
> -; SSE41-NEXT:    movd %edx, %xmm1
> -; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
> -; SSE41-NEXT:    pextrw $2, %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
> -; SSE41-NEXT:    pextrw $3, %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
> -; SSE41-NEXT:    pextrw $4, %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
> -; SSE41-NEXT:    pextrw $5, %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
> -; SSE41-NEXT:    pextrw $6, %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
> -; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    cmovew %ax, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    pinsrw $7, %ecx, %xmm1
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT:    movdqa %xmm0, %xmm1
> +; SSE41-NEXT:    pand %xmm2, %xmm1
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT:    movdqa %xmm3, %xmm4
> +; SSE41-NEXT:    pshufb %xmm1, %xmm4
> +; SSE41-NEXT:    movdqa %xmm0, %xmm1
> +; SSE41-NEXT:    psrlw $4, %xmm1
> +; SSE41-NEXT:    pand %xmm2, %xmm1
> +; SSE41-NEXT:    pxor %xmm2, %xmm2
> +; SSE41-NEXT:    pshufb %xmm1, %xmm3
> +; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
> +; SSE41-NEXT:    pand %xmm4, %xmm1
> +; SSE41-NEXT:    paddb %xmm3, %xmm1
> +; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
> +; SSE41-NEXT:    psrlw $8, %xmm0
> +; SSE41-NEXT:    pand %xmm1, %xmm0
> +; SSE41-NEXT:    psrlw $8, %xmm1
> +; SSE41-NEXT:    paddw %xmm0, %xmm1
> ; SSE41-NEXT:    movdqa %xmm1, %xmm0
> ; SSE41-NEXT:    retq
> ;
> ; AVX-LABEL: testv8i16:
> ; AVX:       # BB#0:
> -; AVX-NEXT:    vpextrw $1, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %cx
> -; AVX-NEXT:    movw $31, %ax
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vmovd %xmm0, %edx
> -; AVX-NEXT:    bsrw %dx, %dx
> -; AVX-NEXT:    cmovew %ax, %dx
> -; AVX-NEXT:    xorl $15, %edx
> -; AVX-NEXT:    vmovd %edx, %xmm1
> -; AVX-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $2, %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $3, %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $4, %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $5, %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $6, %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    cmovew %ax, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm0
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
> +; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
> +; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
> +; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
> +; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
> +; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
> +; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
> +; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
> +; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
> +; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
> +; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
> +; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
> ; AVX-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv8i16:
> @@ -865,47 +790,25 @@ define <8 x i16> @testv8i16(<8 x i16> %i
> ;
> ; X32-SSE-LABEL: testv8i16:
> ; X32-SSE:       # BB#0:
> -; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %cx
> -; X32-SSE-NEXT:    movw $31, %ax
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    movd %xmm0, %edx
> -; X32-SSE-NEXT:    bsrw %dx, %dx
> -; X32-SSE-NEXT:    cmovew %ax, %dx
> -; X32-SSE-NEXT:    xorl $15, %edx
> -; X32-SSE-NEXT:    movd %edx, %xmm1
> -; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrw $2, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrw $3, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrw $4, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrw $5, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrw $6, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrw $7, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    cmovew %ax, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    pinsrw $7, %ecx, %xmm1
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT:    pand %xmm2, %xmm1
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
> +; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
> +; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT:    psrlw $4, %xmm1
> +; X32-SSE-NEXT:    pand %xmm2, %xmm1
> +; X32-SSE-NEXT:    pxor %xmm2, %xmm2
> +; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
> +; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
> +; X32-SSE-NEXT:    pand %xmm4, %xmm1
> +; X32-SSE-NEXT:    paddb %xmm3, %xmm1
> +; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
> +; X32-SSE-NEXT:    psrlw $8, %xmm0
> +; X32-SSE-NEXT:    pand %xmm1, %xmm0
> +; X32-SSE-NEXT:    psrlw $8, %xmm1
> +; X32-SSE-NEXT:    paddw %xmm0, %xmm1
> ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT:    retl
>   %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
> @@ -1001,118 +904,70 @@ define <8 x i16> @testv8i16u(<8 x i16> %
> ;
> ; SSSE3-LABEL: testv8i16u:
> ; SSSE3:       # BB#0:
> -; SSSE3-NEXT:    pextrw $7, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm1
> -; SSSE3-NEXT:    pextrw $3, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm2
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
> -; SSSE3-NEXT:    pextrw $5, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm3
> -; SSSE3-NEXT:    pextrw $1, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm1
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
> -; SSSE3-NEXT:    pextrw $6, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm2
> -; SSSE3-NEXT:    pextrw $2, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm3
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
> -; SSSE3-NEXT:    pextrw $4, %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm2
> -; SSSE3-NEXT:    movd %xmm0, %eax
> -; SSSE3-NEXT:    bsrw %ax, %ax
> -; SSSE3-NEXT:    xorl $15, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm0
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
> -; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT:    movdqa %xmm0, %xmm1
> +; SSSE3-NEXT:    pand %xmm2, %xmm1
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT:    movdqa %xmm3, %xmm4
> +; SSSE3-NEXT:    pshufb %xmm1, %xmm4
> +; SSSE3-NEXT:    movdqa %xmm0, %xmm1
> +; SSSE3-NEXT:    psrlw $4, %xmm1
> +; SSSE3-NEXT:    pand %xmm2, %xmm1
> +; SSSE3-NEXT:    pxor %xmm2, %xmm2
> +; SSSE3-NEXT:    pshufb %xmm1, %xmm3
> +; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
> +; SSSE3-NEXT:    pand %xmm4, %xmm1
> +; SSSE3-NEXT:    paddb %xmm3, %xmm1
> +; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
> +; SSSE3-NEXT:    psrlw $8, %xmm0
> +; SSSE3-NEXT:    pand %xmm1, %xmm0
> +; SSSE3-NEXT:    psrlw $8, %xmm1
> +; SSSE3-NEXT:    paddw %xmm0, %xmm1
> +; SSSE3-NEXT:    movdqa %xmm1, %xmm0
> ; SSSE3-NEXT:    retq
> ;
> ; SSE41-LABEL: testv8i16u:
> ; SSE41:       # BB#0:
> -; SSE41-NEXT:    pextrw $1, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    movd %xmm0, %ecx
> -; SSE41-NEXT:    bsrw %cx, %cx
> -; SSE41-NEXT:    xorl $15, %ecx
> -; SSE41-NEXT:    movd %ecx, %xmm1
> -; SSE41-NEXT:    pinsrw $1, %eax, %xmm1
> -; SSE41-NEXT:    pextrw $2, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    pinsrw $2, %eax, %xmm1
> -; SSE41-NEXT:    pextrw $3, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    pinsrw $3, %eax, %xmm1
> -; SSE41-NEXT:    pextrw $4, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    pinsrw $4, %eax, %xmm1
> -; SSE41-NEXT:    pextrw $5, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    pinsrw $5, %eax, %xmm1
> -; SSE41-NEXT:    pextrw $6, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    pinsrw $6, %eax, %xmm1
> -; SSE41-NEXT:    pextrw $7, %xmm0, %eax
> -; SSE41-NEXT:    bsrw %ax, %ax
> -; SSE41-NEXT:    xorl $15, %eax
> -; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT:    movdqa %xmm0, %xmm1
> +; SSE41-NEXT:    pand %xmm2, %xmm1
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT:    movdqa %xmm3, %xmm4
> +; SSE41-NEXT:    pshufb %xmm1, %xmm4
> +; SSE41-NEXT:    movdqa %xmm0, %xmm1
> +; SSE41-NEXT:    psrlw $4, %xmm1
> +; SSE41-NEXT:    pand %xmm2, %xmm1
> +; SSE41-NEXT:    pxor %xmm2, %xmm2
> +; SSE41-NEXT:    pshufb %xmm1, %xmm3
> +; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
> +; SSE41-NEXT:    pand %xmm4, %xmm1
> +; SSE41-NEXT:    paddb %xmm3, %xmm1
> +; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
> +; SSE41-NEXT:    psrlw $8, %xmm0
> +; SSE41-NEXT:    pand %xmm1, %xmm0
> +; SSE41-NEXT:    psrlw $8, %xmm1
> +; SSE41-NEXT:    paddw %xmm0, %xmm1
> ; SSE41-NEXT:    movdqa %xmm1, %xmm0
> ; SSE41-NEXT:    retq
> ;
> ; AVX-LABEL: testv8i16u:
> ; AVX:       # BB#0:
> -; AVX-NEXT:    vpextrw $1, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vmovd %xmm0, %ecx
> -; AVX-NEXT:    bsrw %cx, %cx
> -; AVX-NEXT:    xorl $15, %ecx
> -; AVX-NEXT:    vmovd %ecx, %xmm1
> -; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $2, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $3, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $4, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $5, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $6, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrw $7, %xmm0, %eax
> -; AVX-NEXT:    bsrw %ax, %ax
> -; AVX-NEXT:    xorl $15, %eax
> -; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
> +; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
> +; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
> +; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
> +; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
> +; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
> +; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
> +; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
> +; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
> +; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
> +; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
> +; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
> ; AVX-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv8i16u:
> @@ -1133,38 +988,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %
> ;
> ; X32-SSE-LABEL: testv8i16u:
> ; X32-SSE:       # BB#0:
> -; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    movd %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrw %cx, %cx
> -; X32-SSE-NEXT:    xorl $15, %ecx
> -; X32-SSE-NEXT:    movd %ecx, %xmm1
> -; X32-SSE-NEXT:    pinsrw $1, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    pinsrw $2, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    pinsrw $3, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    pinsrw $4, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    pinsrw $6, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrw %ax, %ax
> -; X32-SSE-NEXT:    xorl $15, %eax
> -; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT:    pand %xmm2, %xmm1
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
> +; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
> +; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT:    psrlw $4, %xmm1
> +; X32-SSE-NEXT:    pand %xmm2, %xmm1
> +; X32-SSE-NEXT:    pxor %xmm2, %xmm2
> +; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
> +; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
> +; X32-SSE-NEXT:    pand %xmm4, %xmm1
> +; X32-SSE-NEXT:    paddb %xmm3, %xmm1
> +; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
> +; X32-SSE-NEXT:    psrlw $8, %xmm0
> +; X32-SSE-NEXT:    pand %xmm1, %xmm0
> +; X32-SSE-NEXT:    psrlw $8, %xmm1
> +; X32-SSE-NEXT:    paddw %xmm0, %xmm1
> ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT:    retl
>   %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
> @@ -1384,278 +1226,53 @@ define <16 x i8> @testv16i8(<16 x i8> %i
> ;
> ; SSSE3-LABEL: testv16i8:
> ; SSSE3:       # BB#0:
> -; SSSE3-NEXT:    pushq %rbp
> -; SSSE3-NEXT:    pushq %rbx
> -; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT:    bsrl %eax, %ecx
> -; SSSE3-NEXT:    movl $15, %eax
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT:    bsrl %ecx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm1
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %edx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm2
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
> -; SSSE3-NEXT:    bsrl %ebp, %ebp
> -; SSSE3-NEXT:    cmovel %eax, %ebp
> -; SSSE3-NEXT:    xorl $7, %ebp
> -; SSSE3-NEXT:    movd %ebp, %xmm0
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
> -; SSSE3-NEXT:    bsrl %edi, %edi
> -; SSSE3-NEXT:    cmovel %eax, %edi
> -; SSSE3-NEXT:    xorl $7, %edi
> -; SSSE3-NEXT:    movd %edi, %xmm1
> -; SSSE3-NEXT:    bsrl %ecx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm2
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
> -; SSSE3-NEXT:    bsrl %esi, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm3
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT:    bsrl %ecx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm1
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %ebx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    bsrl %edx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm3
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %r11d, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    bsrl %esi, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm2
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
> -; SSSE3-NEXT:    bsrl %r9d, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    bsrl %r10d, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm3
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %r8d, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm4
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT:    bsrl %ecx, %ecx
> -; SSSE3-NEXT:    cmovel %eax, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
> -; SSSE3-NEXT:    popq %rbx
> -; SSSE3-NEXT:    popq %rbp
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT:    movdqa %xmm0, %xmm3
> +; SSSE3-NEXT:    pand %xmm2, %xmm3
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT:    movdqa %xmm1, %xmm4
> +; SSSE3-NEXT:    pshufb %xmm3, %xmm4
> +; SSSE3-NEXT:    psrlw $4, %xmm0
> +; SSSE3-NEXT:    pand %xmm2, %xmm0
> +; SSSE3-NEXT:    pxor %xmm2, %xmm2
> +; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
> +; SSSE3-NEXT:    pand %xmm4, %xmm2
> +; SSSE3-NEXT:    pshufb %xmm0, %xmm1
> +; SSSE3-NEXT:    paddb %xmm2, %xmm1
> +; SSSE3-NEXT:    movdqa %xmm1, %xmm0
> ; SSSE3-NEXT:    retq
> ;
> ; SSE41-LABEL: testv16i8:
> ; SSE41:       # BB#0:
> -; SSE41-NEXT:    pextrb $1, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %ecx
> -; SSE41-NEXT:    movl $15, %eax
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pextrb $0, %xmm0, %edx
> -; SSE41-NEXT:    bsrl %edx, %edx
> -; SSE41-NEXT:    cmovel %eax, %edx
> -; SSE41-NEXT:    xorl $7, %edx
> -; SSE41-NEXT:    movd %edx, %xmm1
> -; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $2, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $3, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $4, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $5, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $6, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $7, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $8, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $9, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $10, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $11, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $12, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $13, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $14, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
> -; SSE41-NEXT:    pextrb $15, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    cmovel %eax, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    pinsrb $15, %ecx, %xmm1
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT:    movdqa %xmm0, %xmm3
> +; SSE41-NEXT:    pand %xmm2, %xmm3
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT:    movdqa %xmm1, %xmm4
> +; SSE41-NEXT:    pshufb %xmm3, %xmm4
> +; SSE41-NEXT:    psrlw $4, %xmm0
> +; SSE41-NEXT:    pand %xmm2, %xmm0
> +; SSE41-NEXT:    pxor %xmm2, %xmm2
> +; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
> +; SSE41-NEXT:    pand %xmm4, %xmm2
> +; SSE41-NEXT:    pshufb %xmm0, %xmm1
> +; SSE41-NEXT:    paddb %xmm2, %xmm1
> ; SSE41-NEXT:    movdqa %xmm1, %xmm0
> ; SSE41-NEXT:    retq
> ;
> ; AVX-LABEL: testv16i8:
> ; AVX:       # BB#0:
> -; AVX-NEXT:    vpextrb $1, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %ecx
> -; AVX-NEXT:    movl $15, %eax
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpextrb $0, %xmm0, %edx
> -; AVX-NEXT:    bsrl %edx, %edx
> -; AVX-NEXT:    cmovel %eax, %edx
> -; AVX-NEXT:    xorl $7, %edx
> -; AVX-NEXT:    vmovd %edx, %xmm1
> -; AVX-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $2, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $3, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $5, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $6, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $7, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $8, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $9, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $10, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $11, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $12, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $13, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $14, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $15, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    cmovel %eax, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vpinsrb $15, %ecx, %xmm1, %xmm0
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
> +; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
> +; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
> +; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
> +; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
> +; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
> +; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
> ; AVX-NEXT:    retq
> ;
> ; AVX512-LABEL: testv16i8:
> @@ -1668,87 +1285,19 @@ define <16 x i8> @testv16i8(<16 x i8> %i
> ;
> ; X32-SSE-LABEL: testv16i8:
> ; X32-SSE:       # BB#0:
> -; X32-SSE-NEXT:    pextrb $1, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %ecx
> -; X32-SSE-NEXT:    movl $15, %eax
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pextrb $0, %xmm0, %edx
> -; X32-SSE-NEXT:    bsrl %edx, %edx
> -; X32-SSE-NEXT:    cmovel %eax, %edx
> -; X32-SSE-NEXT:    xorl $7, %edx
> -; X32-SSE-NEXT:    movd %edx, %xmm1
> -; X32-SSE-NEXT:    pinsrb $1, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $2, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $2, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $3, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $3, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $4, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $4, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $5, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $5, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $6, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $6, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $7, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $7, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $8, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $8, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $9, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $9, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $10, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $10, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $11, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $11, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $12, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $12, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $13, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $13, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $14, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $14, %ecx, %xmm1
> -; X32-SSE-NEXT:    pextrb $15, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    cmovel %eax, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    pinsrb $15, %ecx, %xmm1
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
> +; X32-SSE-NEXT:    pand %xmm2, %xmm3
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
> +; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
> +; X32-SSE-NEXT:    psrlw $4, %xmm0
> +; X32-SSE-NEXT:    pand %xmm2, %xmm0
> +; X32-SSE-NEXT:    pxor %xmm2, %xmm2
> +; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
> +; X32-SSE-NEXT:    pand %xmm4, %xmm2
> +; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
> +; X32-SSE-NEXT:    paddb %xmm2, %xmm1
> ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT:    retl
>   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
> @@ -1930,225 +1479,53 @@ define <16 x i8> @testv16i8u(<16 x i8> %
> ;
> ; SSSE3-LABEL: testv16i8u:
> ; SSSE3:       # BB#0:
> -; SSSE3-NEXT:    pushq %rbx
> -; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT:    bsrl %eax, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm0
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT:    bsrl %esi, %esi
> -; SSSE3-NEXT:    xorl $7, %esi
> -; SSSE3-NEXT:    movd %esi, %xmm1
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %eax, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm0
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
> -; SSSE3-NEXT:    bsrl %ebx, %ebx
> -; SSSE3-NEXT:    xorl $7, %ebx
> -; SSSE3-NEXT:    movd %ebx, %xmm2
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
> -; SSSE3-NEXT:    bsrl %edx, %edx
> -; SSSE3-NEXT:    xorl $7, %edx
> -; SSSE3-NEXT:    movd %edx, %xmm0
> -; SSSE3-NEXT:    bsrl %esi, %edx
> -; SSSE3-NEXT:    xorl $7, %edx
> -; SSSE3-NEXT:    movd %edx, %xmm3
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %ecx, %ecx
> -; SSSE3-NEXT:    xorl $7, %ecx
> -; SSSE3-NEXT:    movd %ecx, %xmm0
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT:    bsrl %edx, %edx
> -; SSSE3-NEXT:    xorl $7, %edx
> -; SSSE3-NEXT:    movd %edx, %xmm1
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
> -; SSSE3-NEXT:    bsrl %edi, %edx
> -; SSSE3-NEXT:    xorl $7, %edx
> -; SSSE3-NEXT:    movd %edx, %xmm0
> -; SSSE3-NEXT:    bsrl %eax, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm2
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %r10d, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm0
> -; SSSE3-NEXT:    bsrl %ecx, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm3
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
> -; SSSE3-NEXT:    bsrl %r9d, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm0
> -; SSSE3-NEXT:    bsrl %r11d, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm2
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT:    bsrl %r8d, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm4
> -; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT:    bsrl %eax, %eax
> -; SSSE3-NEXT:    xorl $7, %eax
> -; SSSE3-NEXT:    movd %eax, %xmm0
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
> -; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
> -; SSSE3-NEXT:    popq %rbx
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT:    movdqa %xmm0, %xmm3
> +; SSSE3-NEXT:    pand %xmm2, %xmm3
> +; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT:    movdqa %xmm1, %xmm4
> +; SSSE3-NEXT:    pshufb %xmm3, %xmm4
> +; SSSE3-NEXT:    psrlw $4, %xmm0
> +; SSSE3-NEXT:    pand %xmm2, %xmm0
> +; SSSE3-NEXT:    pxor %xmm2, %xmm2
> +; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
> +; SSSE3-NEXT:    pand %xmm4, %xmm2
> +; SSSE3-NEXT:    pshufb %xmm0, %xmm1
> +; SSSE3-NEXT:    paddb %xmm2, %xmm1
> +; SSSE3-NEXT:    movdqa %xmm1, %xmm0
> ; SSSE3-NEXT:    retq
> ;
> ; SSE41-LABEL: testv16i8u:
> ; SSE41:       # BB#0:
> -; SSE41-NEXT:    pextrb $1, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pextrb $0, %xmm0, %ecx
> -; SSE41-NEXT:    bsrl %ecx, %ecx
> -; SSE41-NEXT:    xorl $7, %ecx
> -; SSE41-NEXT:    movd %ecx, %xmm1
> -; SSE41-NEXT:    pinsrb $1, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $2, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $2, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $3, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $3, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $4, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $4, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $5, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $5, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $6, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $6, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $7, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $7, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $8, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $8, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $9, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $9, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $10, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $10, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $11, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $11, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $12, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $13, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $13, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $14, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $14, %eax, %xmm1
> -; SSE41-NEXT:    pextrb $15, %xmm0, %eax
> -; SSE41-NEXT:    bsrl %eax, %eax
> -; SSE41-NEXT:    xorl $7, %eax
> -; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT:    movdqa %xmm0, %xmm3
> +; SSE41-NEXT:    pand %xmm2, %xmm3
> +; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT:    movdqa %xmm1, %xmm4
> +; SSE41-NEXT:    pshufb %xmm3, %xmm4
> +; SSE41-NEXT:    psrlw $4, %xmm0
> +; SSE41-NEXT:    pand %xmm2, %xmm0
> +; SSE41-NEXT:    pxor %xmm2, %xmm2
> +; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
> +; SSE41-NEXT:    pand %xmm4, %xmm2
> +; SSE41-NEXT:    pshufb %xmm0, %xmm1
> +; SSE41-NEXT:    paddb %xmm2, %xmm1
> ; SSE41-NEXT:    movdqa %xmm1, %xmm0
> ; SSE41-NEXT:    retq
> ;
> ; AVX-LABEL: testv16i8u:
> ; AVX:       # BB#0:
> -; AVX-NEXT:    vpextrb $1, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpextrb $0, %xmm0, %ecx
> -; AVX-NEXT:    bsrl %ecx, %ecx
> -; AVX-NEXT:    xorl $7, %ecx
> -; AVX-NEXT:    vmovd %ecx, %xmm1
> -; AVX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $2, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $3, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $4, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $5, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $6, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $7, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $8, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $9, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $10, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $11, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $12, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $13, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $14, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
> -; AVX-NEXT:    vpextrb $15, %xmm0, %eax
> -; AVX-NEXT:    bsrl %eax, %eax
> -; AVX-NEXT:    xorl $7, %eax
> -; AVX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
> +; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
> +; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
> +; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
> +; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
> +; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
> +; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
> ; AVX-NEXT:    retq
> ;
> ; AVX512-LABEL: testv16i8u:
> @@ -2161,70 +1538,19 @@ define <16 x i8> @testv16i8u(<16 x i8> %
> ;
> ; X32-SSE-LABEL: testv16i8u:
> ; X32-SSE:       # BB#0:
> -; X32-SSE-NEXT:    pextrb $1, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pextrb $0, %xmm0, %ecx
> -; X32-SSE-NEXT:    bsrl %ecx, %ecx
> -; X32-SSE-NEXT:    xorl $7, %ecx
> -; X32-SSE-NEXT:    movd %ecx, %xmm1
> -; X32-SSE-NEXT:    pinsrb $1, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $2, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $2, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $3, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $3, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $4, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $4, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $5, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $5, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $6, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $6, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $7, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $7, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $8, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $8, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $9, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $9, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $10, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $10, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $11, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $11, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $12, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $12, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $13, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $13, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $14, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $14, %eax, %xmm1
> -; X32-SSE-NEXT:    pextrb $15, %xmm0, %eax
> -; X32-SSE-NEXT:    bsrl %eax, %eax
> -; X32-SSE-NEXT:    xorl $7, %eax
> -; X32-SSE-NEXT:    pinsrb $15, %eax, %xmm1
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
> +; X32-SSE-NEXT:    pand %xmm2, %xmm3
> +; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
> +; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
> +; X32-SSE-NEXT:    psrlw $4, %xmm0
> +; X32-SSE-NEXT:    pand %xmm2, %xmm0
> +; X32-SSE-NEXT:    pxor %xmm2, %xmm2
> +; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
> +; X32-SSE-NEXT:    pand %xmm4, %xmm2
> +; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
> +; X32-SSE-NEXT:    paddb %xmm2, %xmm1
> ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT:    retl
>   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
> 
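For reference, a scalar sketch in C of what the new PSHUFB-based sequences above compute for each byte: the [4,3,2,2,1,1,1,1,0,...,0] constant is a leading-zero table indexed by nibble, and the pcmpeqb/pand pair only lets the low-nibble count through when the high nibble is zero. The table and helper names below are my own, purely illustrative:

  #include <stdint.h>

  /* Leading-zero count of a 4-bit value; this is the constant fed to PSHUFB. */
  static const uint8_t NibbleLZ[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                       0, 0, 0, 0, 0, 0, 0, 0};

  /* ctlz of a single byte via two nibble lookups. */
  uint8_t ctlz8_nibble_lut(uint8_t b) {
    uint8_t lo = b & 0x0f;
    uint8_t hi = b >> 4;
    /* NibbleLZ[hi] is 4 when the high nibble is zero, so adding the
       low-nibble count (which the pcmpeqb/pand pair selects in the
       vector code) yields the full 4..8 result. */
    return (uint8_t)(NibbleLZ[hi] + (hi == 0 ? NibbleLZ[lo] : 0));
  }
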
> Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll?rev=269646&r1=269645&r2=269646&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll Mon May 16 06:19:11 2016
> @@ -35,30 +35,32 @@ define <4 x i64> @testv4i64(<4 x i64> %i
> ;
> ; AVX2-LABEL: testv4i64:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    movl $127, %ecx
> -; AVX2-NEXT:    cmoveq %rcx, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm2
> -; AVX2-NEXT:    vmovq %xmm1, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    cmoveq %rcx, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm1
> -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
> -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
> -; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    cmoveq %rcx, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm3
> -; AVX2-NEXT:    vmovq %xmm0, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    cmoveq %rcx, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm0
> -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
> -; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
> +; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv4i64:
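The widening steps in the v4i64/v8i32/v16i16 sequences all have the same shape: keep the high half's count and add the low half's count only when the high half of the original element is zero, which is what each vpcmpeq/vpsrl/vpand/vpadd group above does at byte, word and dword granularity. A scalar sketch of one 8-bit to 16-bit step, again with names of my own choosing:

  #include <stdint.h>

  /* clz of a 16-bit value, given the per-byte leading-zero counts. */
  unsigned ctlz16_from_bytes(uint16_t v, unsigned clz_hi_byte,
                             unsigned clz_lo_byte) {
    uint8_t hi = (uint8_t)(v >> 8);
    /* If the high byte is zero its count is already 8, so the low byte's
       count extends the run of leading zeros; otherwise the low byte
       contributes nothing.  The 16->32 and 32->64 steps are identical
       with doubled shift amounts. */
    return clz_hi_byte + (hi == 0 ? clz_lo_byte : 0);
  }
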
> @@ -101,25 +103,32 @@ define <4 x i64> @testv4i64u(<4 x i64> %
> ;
> ; AVX2-LABEL: testv4i64u:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm2
> -; AVX2-NEXT:    vmovq %xmm1, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm1
> -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
> -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
> -; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm3
> -; AVX2-NEXT:    vmovq %xmm0, %rax
> -; AVX2-NEXT:    bsrq %rax, %rax
> -; AVX2-NEXT:    vmovq %rax, %xmm0
> -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
> -; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
> +; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv4i64u:
> @@ -181,44 +190,27 @@ define <8 x i32> @testv8i32(<8 x i32> %i
> ;
> ; AVX2-LABEL: testv8i32:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrd $1, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %ecx
> -; AVX2-NEXT:    movl $63, %eax
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    vmovd %xmm1, %edx
> -; AVX2-NEXT:    bsrl %edx, %edx
> -; AVX2-NEXT:    cmovel %eax, %edx
> -; AVX2-NEXT:    vmovd %edx, %xmm2
> -; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm1
> -; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
> -; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    vmovd %xmm0, %edx
> -; AVX2-NEXT:    bsrl %edx, %edx
> -; AVX2-NEXT:    cmovel %eax, %edx
> -; AVX2-NEXT:    vmovd %edx, %xmm3
> -; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrd $2, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm3, %xmm0
> -; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv8i32:
> @@ -271,35 +263,27 @@ define <8 x i32> @testv8i32u(<8 x i32> %
> ;
> ; AVX2-LABEL: testv8i32u:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrd $1, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    vmovd %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    vmovd %ecx, %xmm2
> -; AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrd $2, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
> -; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
> -; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    vmovd %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    vmovd %ecx, %xmm3
> -; AVX2-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
> -; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv8i32u:
> @@ -320,149 +304,56 @@ define <16 x i16> @testv16i16(<16 x i16>
> ; AVX1-LABEL: testv16i16:
> ; AVX1:       # BB#0:
> ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT:    vpextrw $1, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %cx
> -; AVX1-NEXT:    movw $31, %ax
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vmovd %xmm1, %edx
> -; AVX1-NEXT:    bsrw %dx, %dx
> -; AVX1-NEXT:    cmovew %ax, %dx
> -; AVX1-NEXT:    vmovd %edx, %xmm2
> -; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $2, %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $3, %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $4, %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $5, %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $6, %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $7, %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $7, %ecx, %xmm2, %xmm1
> -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX1-NEXT:    vpextrw $1, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vmovd %xmm0, %edx
> -; AVX1-NEXT:    bsrw %dx, %dx
> -; AVX1-NEXT:    cmovew %ax, %dx
> -; AVX1-NEXT:    vmovd %edx, %xmm3
> -; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $2, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $3, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $4, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $5, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $6, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $7, %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    cmovew %ax, %cx
> -; AVX1-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm0
> -; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
> +; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
> +; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
> +; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
> +; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
> +; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
> +; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
> +; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
> +; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
> +; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
> +; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
> +; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
> +; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
> +; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
> +; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT:    retq
> ;
> ; AVX2-LABEL: testv16i16:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %cx
> -; AVX2-NEXT:    movw $31, %ax
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vmovd %xmm1, %edx
> -; AVX2-NEXT:    bsrw %dx, %dx
> -; AVX2-NEXT:    cmovew %ax, %dx
> -; AVX2-NEXT:    vmovd %edx, %xmm2
> -; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $2, %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $3, %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $4, %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $5, %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $6, %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $7, %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $7, %ecx, %xmm2, %xmm1
> -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT:    vpextrw $1, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vmovd %xmm0, %edx
> -; AVX2-NEXT:    bsrw %dx, %dx
> -; AVX2-NEXT:    cmovew %ax, %dx
> -; AVX2-NEXT:    vmovd %edx, %xmm3
> -; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $2, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $3, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $4, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $5, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $6, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $7, %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    cmovew %ax, %cx
> -; AVX2-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm0
> -; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512-LABEL: testv16i16:
> @@ -480,115 +371,56 @@ define <16 x i16> @testv16i16u(<16 x i16
> ; AVX1-LABEL: testv16i16u:
> ; AVX1:       # BB#0:
> ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT:    vpextrw $1, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vmovd %xmm1, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    vmovd %ecx, %xmm2
> -; AVX1-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $2, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $3, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $4, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $5, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $6, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrw $7, %xmm1, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm1
> -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX1-NEXT:    vpextrw $1, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vmovd %xmm0, %ecx
> -; AVX1-NEXT:    bsrw %cx, %cx
> -; AVX1-NEXT:    vmovd %ecx, %xmm3
> -; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $2, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $3, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $4, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $5, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $6, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
> -; AVX1-NEXT:    vpextrw $7, %xmm0, %eax
> -; AVX1-NEXT:    bsrw %ax, %ax
> -; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm0
> -; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
> +; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
> +; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
> +; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
> +; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
> +; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
> +; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
> +; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
> +; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
> +; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
> +; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
> +; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
> +; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
> +; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
> +; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT:    retq
> ;
> ; AVX2-LABEL: testv16i16u:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vmovd %xmm1, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    vmovd %ecx, %xmm2
> -; AVX2-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $2, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $3, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $4, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $5, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $6, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrw $7, %xmm1, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm1
> -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vmovd %xmm0, %ecx
> -; AVX2-NEXT:    bsrw %cx, %cx
> -; AVX2-NEXT:    vmovd %ecx, %xmm3
> -; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $4, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $5, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $6, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
> -; AVX2-NEXT:    vpextrw $7, %xmm0, %eax
> -; AVX2-NEXT:    bsrw %ax, %ax
> -; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm0
> -; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512-LABEL: testv16i16u:
> @@ -606,335 +438,41 @@ define <32 x i8> @testv32i8(<32 x i8> %i
> ; AVX1-LABEL: testv32i8:
> ; AVX1:       # BB#0:
> ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %ecx
> -; AVX1-NEXT:    movl $15, %eax
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
> -; AVX1-NEXT:    bsrl %edx, %edx
> -; AVX1-NEXT:    cmovel %eax, %edx
> -; AVX1-NEXT:    xorl $7, %edx
> -; AVX1-NEXT:    vmovd %edx, %xmm2
> -; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $2, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $3, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $4, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $5, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $6, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $7, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $8, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $9, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $10, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $11, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $12, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $13, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $14, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $15, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm1
> -; AVX1-NEXT:    vpextrb $1, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpextrb $0, %xmm0, %edx
> -; AVX1-NEXT:    bsrl %edx, %edx
> -; AVX1-NEXT:    cmovel %eax, %edx
> -; AVX1-NEXT:    xorl $7, %edx
> -; AVX1-NEXT:    vmovd %edx, %xmm2
> -; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $3, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $5, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $7, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $9, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $11, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $13, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $15, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    cmovel %eax, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm0
> +; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
> +; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
> +; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
> +; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
> +; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
> +; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
> +; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
> +; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
> +; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT:    retq
> ;
> ; AVX2-LABEL: testv32i8:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %ecx
> -; AVX2-NEXT:    movl $15, %eax
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
> -; AVX2-NEXT:    bsrl %edx, %edx
> -; AVX2-NEXT:    cmovel %eax, %edx
> -; AVX2-NEXT:    xorl $7, %edx
> -; AVX2-NEXT:    vmovd %edx, %xmm2
> -; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $7, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm1
> -; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
> -; AVX2-NEXT:    bsrl %edx, %edx
> -; AVX2-NEXT:    cmovel %eax, %edx
> -; AVX2-NEXT:    xorl $7, %edx
> -; AVX2-NEXT:    vmovd %edx, %xmm2
> -; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $5, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $7, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $9, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $11, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $13, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    cmovel %eax, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
> +; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
> +; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv32i8:
> @@ -974,269 +512,41 @@ define <32 x i8> @testv32i8u(<32 x i8> %
> ; AVX1-LABEL: testv32i8u:
> ; AVX1:       # BB#0:
> ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vmovd %ecx, %xmm2
> -; AVX1-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $2, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $3, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $4, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $5, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $6, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $7, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $8, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $9, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $10, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $11, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $12, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $13, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $14, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $15, %xmm1, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
> -; AVX1-NEXT:    vpextrb $1, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
> -; AVX1-NEXT:    bsrl %ecx, %ecx
> -; AVX1-NEXT:    xorl $7, %ecx
> -; AVX1-NEXT:    vmovd %ecx, %xmm2
> -; AVX1-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $2, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $3, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $4, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $5, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $6, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $7, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $8, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $9, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $10, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $11, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $12, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $13, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $14, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX1-NEXT:    vpextrb $15, %xmm0, %eax
> -; AVX1-NEXT:    bsrl %eax, %eax
> -; AVX1-NEXT:    xorl $7, %eax
> -; AVX1-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
> +; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
> +; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
> +; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
> +; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
> +; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
> +; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
> +; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
> +; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
> +; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT:    retq
> ;
> ; AVX2-LABEL: testv32i8u:
> ; AVX2:       # BB#0:
> -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vmovd %ecx, %xmm2
> -; AVX2-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $2, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $3, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $4, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $5, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $6, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $7, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $9, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $10, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $11, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $13, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $14, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $15, %xmm1, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
> -; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
> -; AVX2-NEXT:    bsrl %ecx, %ecx
> -; AVX2-NEXT:    xorl $7, %ecx
> -; AVX2-NEXT:    vmovd %ecx, %xmm2
> -; AVX2-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $2, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $3, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $4, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $6, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $7, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $8, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $9, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $10, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $11, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $12, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $13, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $14, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
> -; AVX2-NEXT:    bsrl %eax, %eax
> -; AVX2-NEXT:    xorl $7, %eax
> -; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
> -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
> +; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
> +; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
> +; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
> +; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT:    retq
> ;
> ; AVX512VLCD-LABEL: testv32i8u:
> 
> 
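For anyone reading the new CHECK lines above rather than the C++ change: the AVX2
sequence is the nibble lookup described in the commit message. Each byte is split
into its low and high nibble, vpshufb looks up the leading-zero count of each nibble
in the [4,3,2,2,1,1,1,1,0,...] table, and the low-nibble count is only kept where the
high nibble compares equal to zero. A rough standalone sketch in C intrinsics of the
same per-byte computation (my own illustration, not code from the patch; the names
ctlz_epi8_avx2 and nib_mask are made up; build with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Per-byte count-leading-zeros via a PSHUFB nibble lookup (sketch). */
static __m256i ctlz_epi8_avx2(__m256i v) {
  /* clz of each nibble value 0..15; clz of the 0 nibble is 4. */
  const __m256i lut = _mm256_setr_epi8(
      4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
      4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m256i nib_mask = _mm256_set1_epi8(0x0F);

  __m256i lo = _mm256_and_si256(v, nib_mask);                       /* low nibbles  */
  __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), nib_mask); /* high nibbles */

  __m256i clz_lo = _mm256_shuffle_epi8(lut, lo);  /* clz of each low nibble  */
  __m256i clz_hi = _mm256_shuffle_epi8(lut, hi);  /* clz of each high nibble */

  /* The low-nibble count only matters when the high nibble is zero; in
     that case clz_hi is already 4, so the sum is 4 + clz(low nibble).   */
  __m256i hi_is_zero = _mm256_cmpeq_epi8(hi, _mm256_setzero_si256());
  return _mm256_add_epi8(clz_hi, _mm256_and_si256(clz_lo, hi_is_zero));
}

int main(void) {
  uint8_t out[32];
  __m256i r = ctlz_epi8_avx2(_mm256_set1_epi8(0x13)); /* 0b00010011 */
  _mm256_storeu_si256((__m256i *)out, r);
  printf("%u\n", (unsigned)out[0]);                   /* prints 3   */
  return 0;
}

The nice property is that lut[0] == 4, so an all-zero high nibble contributes 4 to
the sum and an all-zero byte comes out as 8, which is why the generated
vpshufb/vpcmpeqb/vpaddb sequence needs no extra special case.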