[llvm] r269646 - [X86][SSSE3] Lower vector CTLZ with PSHUFB lookups
Steven Wu via llvm-commits
llvm-commits at lists.llvm.org
Mon May 16 09:03:17 PDT 2016
> On May 16, 2016, at 4:19 AM, Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>
> Author: rksimon
> Date: Mon May 16 06:19:11 2016
> New Revision: 269646
>
> URL: http://llvm.org/viewvc/llvm-project?rev=269646&view=rev
> Log:
> [X86][SSSE3] Lower vector CTLZ with PSHUFB lookups
>
> This patch uses PSHUFB to lower vector CTLZ and avoid (slower) scalarizations.
>
> The leading zero count of each 4-bit nibble of the vector is determined by using a PSHUFB lookup. Pairs of results are then repeatedly combined up to the original element width.
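For reference (not from the patch — just my scalar reading of the description above, with made-up names), the lookup scheme boils down to something like:

  #include <cstdint>

  // Leading-zero count of a 4-bit nibble, indexed by the nibble value
  // (same 16-entry table the patch feeds to PSHUFB).
  const uint8_t NibbleLZ[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                0, 0, 0, 0, 0, 0, 0, 0};

  // One byte: mask the lo-nibble count to zero unless the hi nibble is
  // zero, then add it to the hi-nibble count.
  uint8_t CtlzByte(uint8_t X) {
    uint8_t Hi = X >> 4, Lo = X & 0xF;
    return NibbleLZ[Hi] + (Hi == 0 ? NibbleLZ[Lo] : 0);
  }

  // Pairs of results combine the same way up to the element width,
  // e.g. a 16-bit element from its two byte counts.
  uint8_t CtlzU16(uint16_t X) {
    uint8_t Hi = X >> 8, Lo = X & 0xFF;
    return CtlzByte(Hi) + (Hi == 0 ? CtlzByte(Lo) : 0);
  }

The vectorized code below does the same thing, using PSHUFB for the table lookup and PCMPEQB/PAND for the "is the upper half zero" select before the add.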
>
> Differential Revision: http://reviews.llvm.org/D20016
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
> llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=269646&r1=269645&r2=269646&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon May 16 06:19:11 2016
> @@ -864,6 +864,13 @@ X86TargetLowering::X86TargetLowering(con
> }
> }
>
> + if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
> + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
> + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
> + // ISD::CTLZ v4i32 - scalarization is faster.
> + // ISD::CTLZ v2i64 - scalarization is faster.
> + }
> +
> if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
> for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
> setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
> @@ -932,6 +939,8 @@ X86TargetLowering::X86TargetLowering(con
> }
>
> if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
> + bool HasInt256 = Subtarget.hasInt256();
> +
> addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
> addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
> addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
> @@ -998,14 +1007,21 @@ X86TargetLowering::X86TargetLowering(con
> setOperationAction(ISD::CTTZ, VT, Custom);
> }
>
> + // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
> + // as we end up splitting the 256-bit vectors.
> + for (auto VT : { MVT::v32i8, MVT::v16i16 })
> + setOperationAction(ISD::CTLZ, VT, Custom);
> +
> + if (HasInt256)
> + for (auto VT : { MVT::v8i32, MVT::v4i64 })
> + setOperationAction(ISD::CTLZ, VT, Custom);
> +
> if (Subtarget.hasAnyFMA()) {
> for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
> MVT::v2f64, MVT::v4f64 })
> setOperationAction(ISD::FMA, VT, Legal);
> }
>
> - bool HasInt256 = Subtarget.hasInt256();
> -
> for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
> setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
> setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
> @@ -18767,7 +18783,105 @@ static SDValue LowerVectorCTLZ_AVX512(SD
> return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
> }
>
> -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
> +// Lower CTLZ using a PSHUFB lookup table implementation.
> +static SDValue LowerVectorCTLZInRegLUT(SDValue Op, SDLoc DL,
> + const X86Subtarget &Subtarget,
> + SelectionDAG &DAG) {
> + MVT VT = Op.getSimpleValueType();
> + MVT SVT = VT.getScalarType();
SVT is declared but never used, which is causing an unused-variable compiler warning. Can you fix that?
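Something like this should be all that's needed (just dropping the unused local; untested sketch):

  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);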
Thanks
Steven
> + int NumElts = VT.getVectorNumElements();
> + int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
> + MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
> +
> + // Per-nibble leading zero PSHUFB lookup table.
> + const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
> + /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
> + /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
> + /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
> +
> + SmallVector<SDValue, 64> LUTVec;
> + for (int i = 0; i < NumBytes; ++i)
> + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
> + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
> +
> +  // Begin by bitcasting the input to a byte vector, then split those bytes
> +  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
> + // If the hi input nibble is zero then we add both results together, otherwise
> + // we just take the hi result (by masking the lo result to zero before the
> + // add).
> + SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
> + SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
> +
> + SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
> + SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
> + SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
> + SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
> + SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
> +
> + Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
> + Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
> + Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
> + SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
> +
> +  // Merge the result from vXi8 back to VT, working on the lo/hi halves
> + // of the current vector width in the same way we did for the nibbles.
> + // If the upper half of the input element is zero then add the halves'
> + // leading zero counts together, otherwise just use the upper half's.
> + // Double the width of the result until we are at target width.
> + while (CurrVT != VT) {
> + int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
> + int CurrNumElts = CurrVT.getVectorNumElements();
> + MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
> + MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
> + SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
> +
> + // Check if the upper half of the input element is zero.
> + SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
> + DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
> + HiZ = DAG.getBitcast(NextVT, HiZ);
> +
> + // Move the upper/lower halves to the lower bits as we'll be extending to
> + // NextVT. Mask the lower result to zero if HiZ is true and add the results
> + // together.
> + SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
> + SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
> + SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
> + R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
> + Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
> + CurrVT = NextVT;
> + }
> +
> + return Res;
> +}
> +
> +static SDValue LowerVectorCTLZ(SDValue Op, SDLoc DL,
> + const X86Subtarget &Subtarget,
> + SelectionDAG &DAG) {
> + MVT VT = Op.getSimpleValueType();
> + SDValue Op0 = Op.getOperand(0);
> +
> + if (Subtarget.hasAVX512())
> + return LowerVectorCTLZ_AVX512(Op, DAG);
> +
> + // Decompose 256-bit ops into smaller 128-bit ops.
> + if (VT.is256BitVector() && !Subtarget.hasInt256()) {
> + unsigned NumElems = VT.getVectorNumElements();
> +
> + // Extract each 128-bit vector, perform ctlz and concat the result.
> + SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
> + SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
> +
> + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
> + DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
> + DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
> + }
> +
> + assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
> + return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
> +}
> +
> +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
> + SelectionDAG &DAG) {
> MVT VT = Op.getSimpleValueType();
> MVT OpVT = VT;
> unsigned NumBits = VT.getSizeInBits();
> @@ -18775,7 +18889,7 @@ static SDValue LowerCTLZ(SDValue Op, Sel
> unsigned Opc = Op.getOpcode();
>
> if (VT.isVector())
> - return LowerVectorCTLZ_AVX512(Op, DAG);
> + return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
>
> Op = Op.getOperand(0);
> if (VT == MVT::i8) {
> @@ -21304,7 +21418,7 @@ SDValue X86TargetLowering::LowerOperatio
> case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
> case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
> case ISD::CTLZ:
> - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, DAG);
> + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
> case ISD::CTTZ:
> case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
> case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
>
> Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll?rev=269646&r1=269645&r2=269646&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll Mon May 16 06:19:11 2016
> @@ -706,145 +706,70 @@ define <8 x i16> @testv8i16(<8 x i16> %i
> ;
> ; SSSE3-LABEL: testv8i16:
> ; SSSE3: # BB#0:
> -; SSSE3-NEXT: pextrw $7, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %cx
> -; SSSE3-NEXT: movw $31, %ax
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm1
> -; SSSE3-NEXT: pextrw $3, %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm2
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
> -; SSSE3-NEXT: pextrw $5, %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm3
> -; SSSE3-NEXT: pextrw $1, %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm1
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
> -; SSSE3-NEXT: pextrw $6, %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm2
> -; SSSE3-NEXT: pextrw $2, %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm3
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
> -; SSSE3-NEXT: pextrw $4, %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm2
> -; SSSE3-NEXT: movd %xmm0, %ecx
> -; SSSE3-NEXT: bsrw %cx, %cx
> -; SSSE3-NEXT: cmovew %ax, %cx
> -; SSSE3-NEXT: xorl $15, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: pand %xmm2, %xmm1
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT: movdqa %xmm3, %xmm4
> +; SSSE3-NEXT: pshufb %xmm1, %xmm4
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: psrlw $4, %xmm1
> +; SSSE3-NEXT: pand %xmm2, %xmm1
> +; SSSE3-NEXT: pxor %xmm2, %xmm2
> +; SSSE3-NEXT: pshufb %xmm1, %xmm3
> +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
> +; SSSE3-NEXT: pand %xmm4, %xmm1
> +; SSSE3-NEXT: paddb %xmm3, %xmm1
> +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
> +; SSSE3-NEXT: psrlw $8, %xmm0
> +; SSSE3-NEXT: pand %xmm1, %xmm0
> +; SSSE3-NEXT: psrlw $8, %xmm1
> +; SSSE3-NEXT: paddw %xmm0, %xmm1
> +; SSSE3-NEXT: movdqa %xmm1, %xmm0
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: testv8i16:
> ; SSE41: # BB#0:
> -; SSE41-NEXT: pextrw $1, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %cx
> -; SSE41-NEXT: movw $31, %ax
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: movd %xmm0, %edx
> -; SSE41-NEXT: bsrw %dx, %dx
> -; SSE41-NEXT: cmovew %ax, %dx
> -; SSE41-NEXT: xorl $15, %edx
> -; SSE41-NEXT: movd %edx, %xmm1
> -; SSE41-NEXT: pinsrw $1, %ecx, %xmm1
> -; SSE41-NEXT: pextrw $2, %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: pinsrw $2, %ecx, %xmm1
> -; SSE41-NEXT: pextrw $3, %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: pinsrw $3, %ecx, %xmm1
> -; SSE41-NEXT: pextrw $4, %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: pinsrw $4, %ecx, %xmm1
> -; SSE41-NEXT: pextrw $5, %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: pinsrw $5, %ecx, %xmm1
> -; SSE41-NEXT: pextrw $6, %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: pinsrw $6, %ecx, %xmm1
> -; SSE41-NEXT: pextrw $7, %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: cmovew %ax, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: pinsrw $7, %ecx, %xmm1
> +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT: movdqa %xmm0, %xmm1
> +; SSE41-NEXT: pand %xmm2, %xmm1
> +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT: movdqa %xmm3, %xmm4
> +; SSE41-NEXT: pshufb %xmm1, %xmm4
> +; SSE41-NEXT: movdqa %xmm0, %xmm1
> +; SSE41-NEXT: psrlw $4, %xmm1
> +; SSE41-NEXT: pand %xmm2, %xmm1
> +; SSE41-NEXT: pxor %xmm2, %xmm2
> +; SSE41-NEXT: pshufb %xmm1, %xmm3
> +; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
> +; SSE41-NEXT: pand %xmm4, %xmm1
> +; SSE41-NEXT: paddb %xmm3, %xmm1
> +; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
> +; SSE41-NEXT: psrlw $8, %xmm0
> +; SSE41-NEXT: pand %xmm1, %xmm0
> +; SSE41-NEXT: psrlw $8, %xmm1
> +; SSE41-NEXT: paddw %xmm0, %xmm1
> ; SSE41-NEXT: movdqa %xmm1, %xmm0
> ; SSE41-NEXT: retq
> ;
> ; AVX-LABEL: testv8i16:
> ; AVX: # BB#0:
> -; AVX-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %cx
> -; AVX-NEXT: movw $31, %ax
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vmovd %xmm0, %edx
> -; AVX-NEXT: bsrw %dx, %dx
> -; AVX-NEXT: cmovew %ax, %dx
> -; AVX-NEXT: xorl $15, %edx
> -; AVX-NEXT: vmovd %edx, %xmm1
> -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $2, %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $3, %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $4, %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $5, %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $6, %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $7, %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: cmovew %ax, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
> +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
> +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
> +; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
> +; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
> +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
> +; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
> +; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
> +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
> +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
> +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
> ; AVX-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv8i16:
> @@ -865,47 +790,25 @@ define <8 x i16> @testv8i16(<8 x i16> %i
> ;
> ; X32-SSE-LABEL: testv8i16:
> ; X32-SSE: # BB#0:
> -; X32-SSE-NEXT: pextrw $1, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %cx
> -; X32-SSE-NEXT: movw $31, %ax
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: movd %xmm0, %edx
> -; X32-SSE-NEXT: bsrw %dx, %dx
> -; X32-SSE-NEXT: cmovew %ax, %dx
> -; X32-SSE-NEXT: xorl $15, %edx
> -; X32-SSE-NEXT: movd %edx, %xmm1
> -; X32-SSE-NEXT: pinsrw $1, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrw $2, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: pinsrw $2, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrw $3, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: pinsrw $3, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrw $4, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: pinsrw $4, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrw $5, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: pinsrw $5, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrw $6, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: pinsrw $6, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrw $7, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: cmovew %ax, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: pinsrw $7, %ecx, %xmm1
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT: movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT: pand %xmm2, %xmm1
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT: movdqa %xmm3, %xmm4
> +; X32-SSE-NEXT: pshufb %xmm1, %xmm4
> +; X32-SSE-NEXT: movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT: psrlw $4, %xmm1
> +; X32-SSE-NEXT: pand %xmm2, %xmm1
> +; X32-SSE-NEXT: pxor %xmm2, %xmm2
> +; X32-SSE-NEXT: pshufb %xmm1, %xmm3
> +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
> +; X32-SSE-NEXT: pand %xmm4, %xmm1
> +; X32-SSE-NEXT: paddb %xmm3, %xmm1
> +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
> +; X32-SSE-NEXT: psrlw $8, %xmm0
> +; X32-SSE-NEXT: pand %xmm1, %xmm0
> +; X32-SSE-NEXT: psrlw $8, %xmm1
> +; X32-SSE-NEXT: paddw %xmm0, %xmm1
> ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT: retl
> %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
> @@ -1001,118 +904,70 @@ define <8 x i16> @testv8i16u(<8 x i16> %
> ;
> ; SSSE3-LABEL: testv8i16u:
> ; SSSE3: # BB#0:
> -; SSSE3-NEXT: pextrw $7, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm1
> -; SSSE3-NEXT: pextrw $3, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm2
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
> -; SSSE3-NEXT: pextrw $5, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm3
> -; SSSE3-NEXT: pextrw $1, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm1
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
> -; SSSE3-NEXT: pextrw $6, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm2
> -; SSSE3-NEXT: pextrw $2, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm3
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
> -; SSSE3-NEXT: pextrw $4, %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm2
> -; SSSE3-NEXT: movd %xmm0, %eax
> -; SSSE3-NEXT: bsrw %ax, %ax
> -; SSSE3-NEXT: xorl $15, %eax
> -; SSSE3-NEXT: movd %eax, %xmm0
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
> -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: pand %xmm2, %xmm1
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT: movdqa %xmm3, %xmm4
> +; SSSE3-NEXT: pshufb %xmm1, %xmm4
> +; SSSE3-NEXT: movdqa %xmm0, %xmm1
> +; SSSE3-NEXT: psrlw $4, %xmm1
> +; SSSE3-NEXT: pand %xmm2, %xmm1
> +; SSSE3-NEXT: pxor %xmm2, %xmm2
> +; SSSE3-NEXT: pshufb %xmm1, %xmm3
> +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
> +; SSSE3-NEXT: pand %xmm4, %xmm1
> +; SSSE3-NEXT: paddb %xmm3, %xmm1
> +; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
> +; SSSE3-NEXT: psrlw $8, %xmm0
> +; SSSE3-NEXT: pand %xmm1, %xmm0
> +; SSSE3-NEXT: psrlw $8, %xmm1
> +; SSSE3-NEXT: paddw %xmm0, %xmm1
> +; SSSE3-NEXT: movdqa %xmm1, %xmm0
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: testv8i16u:
> ; SSE41: # BB#0:
> -; SSE41-NEXT: pextrw $1, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: movd %xmm0, %ecx
> -; SSE41-NEXT: bsrw %cx, %cx
> -; SSE41-NEXT: xorl $15, %ecx
> -; SSE41-NEXT: movd %ecx, %xmm1
> -; SSE41-NEXT: pinsrw $1, %eax, %xmm1
> -; SSE41-NEXT: pextrw $2, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: pinsrw $2, %eax, %xmm1
> -; SSE41-NEXT: pextrw $3, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: pinsrw $3, %eax, %xmm1
> -; SSE41-NEXT: pextrw $4, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: pinsrw $4, %eax, %xmm1
> -; SSE41-NEXT: pextrw $5, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: pinsrw $5, %eax, %xmm1
> -; SSE41-NEXT: pextrw $6, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: pinsrw $6, %eax, %xmm1
> -; SSE41-NEXT: pextrw $7, %xmm0, %eax
> -; SSE41-NEXT: bsrw %ax, %ax
> -; SSE41-NEXT: xorl $15, %eax
> -; SSE41-NEXT: pinsrw $7, %eax, %xmm1
> +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT: movdqa %xmm0, %xmm1
> +; SSE41-NEXT: pand %xmm2, %xmm1
> +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT: movdqa %xmm3, %xmm4
> +; SSE41-NEXT: pshufb %xmm1, %xmm4
> +; SSE41-NEXT: movdqa %xmm0, %xmm1
> +; SSE41-NEXT: psrlw $4, %xmm1
> +; SSE41-NEXT: pand %xmm2, %xmm1
> +; SSE41-NEXT: pxor %xmm2, %xmm2
> +; SSE41-NEXT: pshufb %xmm1, %xmm3
> +; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
> +; SSE41-NEXT: pand %xmm4, %xmm1
> +; SSE41-NEXT: paddb %xmm3, %xmm1
> +; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
> +; SSE41-NEXT: psrlw $8, %xmm0
> +; SSE41-NEXT: pand %xmm1, %xmm0
> +; SSE41-NEXT: psrlw $8, %xmm1
> +; SSE41-NEXT: paddw %xmm0, %xmm1
> ; SSE41-NEXT: movdqa %xmm1, %xmm0
> ; SSE41-NEXT: retq
> ;
> ; AVX-LABEL: testv8i16u:
> ; AVX: # BB#0:
> -; AVX-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vmovd %xmm0, %ecx
> -; AVX-NEXT: bsrw %cx, %cx
> -; AVX-NEXT: xorl $15, %ecx
> -; AVX-NEXT: vmovd %ecx, %xmm1
> -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $2, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $3, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $4, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $5, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $6, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $7, %xmm0, %eax
> -; AVX-NEXT: bsrw %ax, %ax
> -; AVX-NEXT: xorl $15, %eax
> -; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
> +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
> +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
> +; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
> +; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
> +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
> +; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
> +; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
> +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
> +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
> +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
> ; AVX-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv8i16u:
> @@ -1133,38 +988,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %
> ;
> ; X32-SSE-LABEL: testv8i16u:
> ; X32-SSE: # BB#0:
> -; X32-SSE-NEXT: pextrw $1, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: movd %xmm0, %ecx
> -; X32-SSE-NEXT: bsrw %cx, %cx
> -; X32-SSE-NEXT: xorl $15, %ecx
> -; X32-SSE-NEXT: movd %ecx, %xmm1
> -; X32-SSE-NEXT: pinsrw $1, %eax, %xmm1
> -; X32-SSE-NEXT: pextrw $2, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: pinsrw $2, %eax, %xmm1
> -; X32-SSE-NEXT: pextrw $3, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: pinsrw $3, %eax, %xmm1
> -; X32-SSE-NEXT: pextrw $4, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: pinsrw $4, %eax, %xmm1
> -; X32-SSE-NEXT: pextrw $5, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: pinsrw $5, %eax, %xmm1
> -; X32-SSE-NEXT: pextrw $6, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: pinsrw $6, %eax, %xmm1
> -; X32-SSE-NEXT: pextrw $7, %xmm0, %eax
> -; X32-SSE-NEXT: bsrw %ax, %ax
> -; X32-SSE-NEXT: xorl $15, %eax
> -; X32-SSE-NEXT: pinsrw $7, %eax, %xmm1
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT: movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT: pand %xmm2, %xmm1
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT: movdqa %xmm3, %xmm4
> +; X32-SSE-NEXT: pshufb %xmm1, %xmm4
> +; X32-SSE-NEXT: movdqa %xmm0, %xmm1
> +; X32-SSE-NEXT: psrlw $4, %xmm1
> +; X32-SSE-NEXT: pand %xmm2, %xmm1
> +; X32-SSE-NEXT: pxor %xmm2, %xmm2
> +; X32-SSE-NEXT: pshufb %xmm1, %xmm3
> +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
> +; X32-SSE-NEXT: pand %xmm4, %xmm1
> +; X32-SSE-NEXT: paddb %xmm3, %xmm1
> +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
> +; X32-SSE-NEXT: psrlw $8, %xmm0
> +; X32-SSE-NEXT: pand %xmm1, %xmm0
> +; X32-SSE-NEXT: psrlw $8, %xmm1
> +; X32-SSE-NEXT: paddw %xmm0, %xmm1
> ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT: retl
> %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
> @@ -1384,278 +1226,53 @@ define <16 x i8> @testv16i8(<16 x i8> %i
> ;
> ; SSSE3-LABEL: testv16i8:
> ; SSSE3: # BB#0:
> -; SSSE3-NEXT: pushq %rbp
> -; SSSE3-NEXT: pushq %rbx
> -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT: bsrl %eax, %ecx
> -; SSSE3-NEXT: movl $15, %eax
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT: bsrl %ecx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm1
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %edx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm2
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
> -; SSSE3-NEXT: bsrl %ebp, %ebp
> -; SSSE3-NEXT: cmovel %eax, %ebp
> -; SSSE3-NEXT: xorl $7, %ebp
> -; SSSE3-NEXT: movd %ebp, %xmm0
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
> -; SSSE3-NEXT: bsrl %edi, %edi
> -; SSSE3-NEXT: cmovel %eax, %edi
> -; SSSE3-NEXT: xorl $7, %edi
> -; SSSE3-NEXT: movd %edi, %xmm1
> -; SSSE3-NEXT: bsrl %ecx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm2
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
> -; SSSE3-NEXT: bsrl %esi, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm3
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT: bsrl %ecx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm1
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %ebx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: bsrl %edx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm3
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %r11d, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: bsrl %esi, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm2
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
> -; SSSE3-NEXT: bsrl %r9d, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: bsrl %r10d, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm3
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %r8d, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm4
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT: bsrl %ecx, %ecx
> -; SSSE3-NEXT: cmovel %eax, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
> -; SSSE3-NEXT: popq %rbx
> -; SSSE3-NEXT: popq %rbp
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT: movdqa %xmm0, %xmm3
> +; SSSE3-NEXT: pand %xmm2, %xmm3
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT: movdqa %xmm1, %xmm4
> +; SSSE3-NEXT: pshufb %xmm3, %xmm4
> +; SSSE3-NEXT: psrlw $4, %xmm0
> +; SSSE3-NEXT: pand %xmm2, %xmm0
> +; SSSE3-NEXT: pxor %xmm2, %xmm2
> +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
> +; SSSE3-NEXT: pand %xmm4, %xmm2
> +; SSSE3-NEXT: pshufb %xmm0, %xmm1
> +; SSSE3-NEXT: paddb %xmm2, %xmm1
> +; SSSE3-NEXT: movdqa %xmm1, %xmm0
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: testv16i8:
> ; SSE41: # BB#0:
> -; SSE41-NEXT: pextrb $1, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %ecx
> -; SSE41-NEXT: movl $15, %eax
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pextrb $0, %xmm0, %edx
> -; SSE41-NEXT: bsrl %edx, %edx
> -; SSE41-NEXT: cmovel %eax, %edx
> -; SSE41-NEXT: xorl $7, %edx
> -; SSE41-NEXT: movd %edx, %xmm1
> -; SSE41-NEXT: pinsrb $1, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $2, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $2, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $3, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $3, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $4, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $4, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $5, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $5, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $6, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $7, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $8, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $8, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $9, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $9, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $10, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $10, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $11, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $11, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $12, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $12, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $13, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $13, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $14, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
> -; SSE41-NEXT: pextrb $15, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: cmovel %eax, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: pinsrb $15, %ecx, %xmm1
> +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT: movdqa %xmm0, %xmm3
> +; SSE41-NEXT: pand %xmm2, %xmm3
> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT: movdqa %xmm1, %xmm4
> +; SSE41-NEXT: pshufb %xmm3, %xmm4
> +; SSE41-NEXT: psrlw $4, %xmm0
> +; SSE41-NEXT: pand %xmm2, %xmm0
> +; SSE41-NEXT: pxor %xmm2, %xmm2
> +; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
> +; SSE41-NEXT: pand %xmm4, %xmm2
> +; SSE41-NEXT: pshufb %xmm0, %xmm1
> +; SSE41-NEXT: paddb %xmm2, %xmm1
> ; SSE41-NEXT: movdqa %xmm1, %xmm0
> ; SSE41-NEXT: retq
> ;
> ; AVX-LABEL: testv16i8:
> ; AVX: # BB#0:
> -; AVX-NEXT: vpextrb $1, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %ecx
> -; AVX-NEXT: movl $15, %eax
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpextrb $0, %xmm0, %edx
> -; AVX-NEXT: bsrl %edx, %edx
> -; AVX-NEXT: cmovel %eax, %edx
> -; AVX-NEXT: xorl $7, %edx
> -; AVX-NEXT: vmovd %edx, %xmm1
> -; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $2, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $3, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $4, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $5, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $6, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $7, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $8, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $9, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $10, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $11, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $12, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $13, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $14, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $15, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: cmovel %eax, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm0
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
> +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
> +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
> +; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
> +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
> +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
> ; AVX-NEXT: retq
> ;
> ; AVX512-LABEL: testv16i8:
> @@ -1668,87 +1285,19 @@ define <16 x i8> @testv16i8(<16 x i8> %i
> ;
> ; X32-SSE-LABEL: testv16i8:
> ; X32-SSE: # BB#0:
> -; X32-SSE-NEXT: pextrb $1, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %ecx
> -; X32-SSE-NEXT: movl $15, %eax
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pextrb $0, %xmm0, %edx
> -; X32-SSE-NEXT: bsrl %edx, %edx
> -; X32-SSE-NEXT: cmovel %eax, %edx
> -; X32-SSE-NEXT: xorl $7, %edx
> -; X32-SSE-NEXT: movd %edx, %xmm1
> -; X32-SSE-NEXT: pinsrb $1, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $2, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $2, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $3, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $3, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $4, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $4, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $5, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $5, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $6, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $6, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $7, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $7, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $8, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $8, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $9, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $9, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $10, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $10, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $11, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $11, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $12, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $12, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $13, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $13, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $14, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $14, %ecx, %xmm1
> -; X32-SSE-NEXT: pextrb $15, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: cmovel %eax, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: pinsrb $15, %ecx, %xmm1
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT: movdqa %xmm0, %xmm3
> +; X32-SSE-NEXT: pand %xmm2, %xmm3
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT: movdqa %xmm1, %xmm4
> +; X32-SSE-NEXT: pshufb %xmm3, %xmm4
> +; X32-SSE-NEXT: psrlw $4, %xmm0
> +; X32-SSE-NEXT: pand %xmm2, %xmm0
> +; X32-SSE-NEXT: pxor %xmm2, %xmm2
> +; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
> +; X32-SSE-NEXT: pand %xmm4, %xmm2
> +; X32-SSE-NEXT: pshufb %xmm0, %xmm1
> +; X32-SSE-NEXT: paddb %xmm2, %xmm1
> ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT: retl
> %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
> @@ -1930,225 +1479,53 @@ define <16 x i8> @testv16i8u(<16 x i8> %
> ;
> ; SSSE3-LABEL: testv16i8u:
> ; SSSE3: # BB#0:
> -; SSSE3-NEXT: pushq %rbx
> -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT: bsrl %eax, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm0
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT: bsrl %esi, %esi
> -; SSSE3-NEXT: xorl $7, %esi
> -; SSSE3-NEXT: movd %esi, %xmm1
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %eax, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm0
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
> -; SSSE3-NEXT: bsrl %ebx, %ebx
> -; SSSE3-NEXT: xorl $7, %ebx
> -; SSSE3-NEXT: movd %ebx, %xmm2
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
> -; SSSE3-NEXT: bsrl %edx, %edx
> -; SSSE3-NEXT: xorl $7, %edx
> -; SSSE3-NEXT: movd %edx, %xmm0
> -; SSSE3-NEXT: bsrl %esi, %edx
> -; SSSE3-NEXT: xorl $7, %edx
> -; SSSE3-NEXT: movd %edx, %xmm3
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %ecx, %ecx
> -; SSSE3-NEXT: xorl $7, %ecx
> -; SSSE3-NEXT: movd %ecx, %xmm0
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
> -; SSSE3-NEXT: bsrl %edx, %edx
> -; SSSE3-NEXT: xorl $7, %edx
> -; SSSE3-NEXT: movd %edx, %xmm1
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
> -; SSSE3-NEXT: bsrl %edi, %edx
> -; SSSE3-NEXT: xorl $7, %edx
> -; SSSE3-NEXT: movd %edx, %xmm0
> -; SSSE3-NEXT: bsrl %eax, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm2
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %r10d, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm0
> -; SSSE3-NEXT: bsrl %ecx, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm3
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
> -; SSSE3-NEXT: bsrl %r9d, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm0
> -; SSSE3-NEXT: bsrl %r11d, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm2
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
> -; SSSE3-NEXT: bsrl %r8d, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm4
> -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
> -; SSSE3-NEXT: bsrl %eax, %eax
> -; SSSE3-NEXT: xorl $7, %eax
> -; SSSE3-NEXT: movd %eax, %xmm0
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
> -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
> -; SSSE3-NEXT: popq %rbx
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSSE3-NEXT: movdqa %xmm0, %xmm3
> +; SSSE3-NEXT: pand %xmm2, %xmm3
> +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSSE3-NEXT: movdqa %xmm1, %xmm4
> +; SSSE3-NEXT: pshufb %xmm3, %xmm4
> +; SSSE3-NEXT: psrlw $4, %xmm0
> +; SSSE3-NEXT: pand %xmm2, %xmm0
> +; SSSE3-NEXT: pxor %xmm2, %xmm2
> +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
> +; SSSE3-NEXT: pand %xmm4, %xmm2
> +; SSSE3-NEXT: pshufb %xmm0, %xmm1
> +; SSSE3-NEXT: paddb %xmm2, %xmm1
> +; SSSE3-NEXT: movdqa %xmm1, %xmm0
> ; SSSE3-NEXT: retq
> ;
> ; SSE41-LABEL: testv16i8u:
> ; SSE41: # BB#0:
> -; SSE41-NEXT: pextrb $1, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pextrb $0, %xmm0, %ecx
> -; SSE41-NEXT: bsrl %ecx, %ecx
> -; SSE41-NEXT: xorl $7, %ecx
> -; SSE41-NEXT: movd %ecx, %xmm1
> -; SSE41-NEXT: pinsrb $1, %eax, %xmm1
> -; SSE41-NEXT: pextrb $2, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $2, %eax, %xmm1
> -; SSE41-NEXT: pextrb $3, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $3, %eax, %xmm1
> -; SSE41-NEXT: pextrb $4, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $4, %eax, %xmm1
> -; SSE41-NEXT: pextrb $5, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $5, %eax, %xmm1
> -; SSE41-NEXT: pextrb $6, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $6, %eax, %xmm1
> -; SSE41-NEXT: pextrb $7, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $7, %eax, %xmm1
> -; SSE41-NEXT: pextrb $8, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $8, %eax, %xmm1
> -; SSE41-NEXT: pextrb $9, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $9, %eax, %xmm1
> -; SSE41-NEXT: pextrb $10, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $10, %eax, %xmm1
> -; SSE41-NEXT: pextrb $11, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $11, %eax, %xmm1
> -; SSE41-NEXT: pextrb $12, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $12, %eax, %xmm1
> -; SSE41-NEXT: pextrb $13, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $13, %eax, %xmm1
> -; SSE41-NEXT: pextrb $14, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $14, %eax, %xmm1
> -; SSE41-NEXT: pextrb $15, %xmm0, %eax
> -; SSE41-NEXT: bsrl %eax, %eax
> -; SSE41-NEXT: xorl $7, %eax
> -; SSE41-NEXT: pinsrb $15, %eax, %xmm1
> +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; SSE41-NEXT: movdqa %xmm0, %xmm3
> +; SSE41-NEXT: pand %xmm2, %xmm3
> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; SSE41-NEXT: movdqa %xmm1, %xmm4
> +; SSE41-NEXT: pshufb %xmm3, %xmm4
> +; SSE41-NEXT: psrlw $4, %xmm0
> +; SSE41-NEXT: pand %xmm2, %xmm0
> +; SSE41-NEXT: pxor %xmm2, %xmm2
> +; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
> +; SSE41-NEXT: pand %xmm4, %xmm2
> +; SSE41-NEXT: pshufb %xmm0, %xmm1
> +; SSE41-NEXT: paddb %xmm2, %xmm1
> ; SSE41-NEXT: movdqa %xmm1, %xmm0
> ; SSE41-NEXT: retq
> ;
> ; AVX-LABEL: testv16i8u:
> ; AVX: # BB#0:
> -; AVX-NEXT: vpextrb $1, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpextrb $0, %xmm0, %ecx
> -; AVX-NEXT: bsrl %ecx, %ecx
> -; AVX-NEXT: xorl $7, %ecx
> -; AVX-NEXT: vmovd %ecx, %xmm1
> -; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $2, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $3, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $4, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $5, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $6, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $7, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $8, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $9, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $10, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $11, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $12, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $13, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $14, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrb $15, %xmm0, %eax
> -; AVX-NEXT: bsrl %eax, %eax
> -; AVX-NEXT: xorl $7, %eax
> -; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
> +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
> +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
> +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
> +; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
> +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
> +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
> ; AVX-NEXT: retq
> ;
> ; AVX512-LABEL: testv16i8u:
> @@ -2161,70 +1538,19 @@ define <16 x i8> @testv16i8u(<16 x i8> %
> ;
> ; X32-SSE-LABEL: testv16i8u:
> ; X32-SSE: # BB#0:
> -; X32-SSE-NEXT: pextrb $1, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pextrb $0, %xmm0, %ecx
> -; X32-SSE-NEXT: bsrl %ecx, %ecx
> -; X32-SSE-NEXT: xorl $7, %ecx
> -; X32-SSE-NEXT: movd %ecx, %xmm1
> -; X32-SSE-NEXT: pinsrb $1, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $2, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $2, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $3, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $3, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $4, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $4, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $5, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $5, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $6, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $6, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $7, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $7, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $8, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $8, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $9, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $9, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $10, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $10, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $11, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $11, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $12, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $12, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $13, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $13, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $14, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $14, %eax, %xmm1
> -; X32-SSE-NEXT: pextrb $15, %xmm0, %eax
> -; X32-SSE-NEXT: bsrl %eax, %eax
> -; X32-SSE-NEXT: xorl $7, %eax
> -; X32-SSE-NEXT: pinsrb $15, %eax, %xmm1
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; X32-SSE-NEXT: movdqa %xmm0, %xmm3
> +; X32-SSE-NEXT: pand %xmm2, %xmm3
> +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; X32-SSE-NEXT: movdqa %xmm1, %xmm4
> +; X32-SSE-NEXT: pshufb %xmm3, %xmm4
> +; X32-SSE-NEXT: psrlw $4, %xmm0
> +; X32-SSE-NEXT: pand %xmm2, %xmm0
> +; X32-SSE-NEXT: pxor %xmm2, %xmm2
> +; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
> +; X32-SSE-NEXT: pand %xmm4, %xmm2
> +; X32-SSE-NEXT: pshufb %xmm0, %xmm1
> +; X32-SSE-NEXT: paddb %xmm2, %xmm1
> ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
> ; X32-SSE-NEXT: retl
> %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
>
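For readers skimming the new check lines: if I'm following the v16i8 sequence correctly, the per-byte step is just a nibble lookup plus a masked add. A scalar sketch of that step (my own paraphrase of the generated code, not anything taken from the patch):

  #include <cstdint>

  // The PSHUFB constant from the diff: leading-zero count of each
  // 4-bit nibble value 0..15.
  static const uint8_t NibbleLUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                        0, 0, 0, 0, 0, 0, 0, 0};

  // Leading zeros of one byte; returns 8 for a zero byte.
  static uint8_t CtlzByte(uint8_t X) {
    uint8_t Hi = X >> 4;
    uint8_t Lo = X & 0xF;
    // NibbleLUT[0] == 4, so when the high nibble is zero the two table
    // entries sum to the full 8-bit count. The (Hi == 0) mask is the
    // pcmpeqb/pand pair in the assembly above.
    return NibbleLUT[Hi] + (Hi == 0 ? NibbleLUT[Lo] : 0);
  }
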
> Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll?rev=269646&r1=269645&r2=269646&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll Mon May 16 06:19:11 2016
> @@ -35,30 +35,32 @@ define <4 x i64> @testv4i64(<4 x i64> %i
> ;
> ; AVX2-LABEL: testv4i64:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrq $1, %xmm1, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: movl $127, %ecx
> -; AVX2-NEXT: cmoveq %rcx, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm2
> -; AVX2-NEXT: vmovq %xmm1, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: cmoveq %rcx, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm1
> -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
> -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
> -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrq $1, %xmm0, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: cmoveq %rcx, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm3
> -; AVX2-NEXT: vmovq %xmm0, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: cmoveq %rcx, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm0
> -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
> -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
> +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv4i64:
> @@ -101,25 +103,32 @@ define <4 x i64> @testv4i64u(<4 x i64> %
> ;
> ; AVX2-LABEL: testv4i64u:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrq $1, %xmm1, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm2
> -; AVX2-NEXT: vmovq %xmm1, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm1
> -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
> -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
> -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrq $1, %xmm0, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm3
> -; AVX2-NEXT: vmovq %xmm0, %rax
> -; AVX2-NEXT: bsrq %rax, %rax
> -; AVX2-NEXT: vmovq %rax, %xmm0
> -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
> -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
> +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv4i64u:
> @@ -181,44 +190,27 @@ define <8 x i32> @testv8i32(<8 x i32> %i
> ;
> ; AVX2-LABEL: testv8i32:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrd $1, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %ecx
> -; AVX2-NEXT: movl $63, %eax
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: vmovd %xmm1, %edx
> -; AVX2-NEXT: bsrl %edx, %edx
> -; AVX2-NEXT: cmovel %eax, %edx
> -; AVX2-NEXT: vmovd %edx, %xmm2
> -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
> -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
> -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrd $1, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: vmovd %xmm0, %edx
> -; AVX2-NEXT: bsrl %edx, %edx
> -; AVX2-NEXT: cmovel %eax, %edx
> -; AVX2-NEXT: vmovd %edx, %xmm3
> -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrd $2, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrd $3, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0
> -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv8i32:
> @@ -271,35 +263,27 @@ define <8 x i32> @testv8i32u(<8 x i32> %
> ;
> ; AVX2-LABEL: testv8i32u:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrd $1, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: vmovd %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: vmovd %ecx, %xmm2
> -; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrd $2, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrd $3, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
> -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
> -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrd $1, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: vmovd %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: vmovd %ecx, %xmm3
> -; AVX2-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrd $2, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrd $3, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
> -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
> +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
> +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv8i32u:
> @@ -320,149 +304,56 @@ define <16 x i16> @testv16i16(<16 x i16>
> ; AVX1-LABEL: testv16i16:
> ; AVX1: # BB#0:
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT: vpextrw $1, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %cx
> -; AVX1-NEXT: movw $31, %ax
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vmovd %xmm1, %edx
> -; AVX1-NEXT: bsrw %dx, %dx
> -; AVX1-NEXT: cmovew %ax, %dx
> -; AVX1-NEXT: vmovd %edx, %xmm2
> -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $2, %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $3, %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $4, %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $5, %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $6, %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $7, %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
> -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX1-NEXT: vpextrw $1, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vmovd %xmm0, %edx
> -; AVX1-NEXT: bsrw %dx, %dx
> -; AVX1-NEXT: cmovew %ax, %dx
> -; AVX1-NEXT: vmovd %edx, %xmm3
> -; AVX1-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $2, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $3, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $4, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $5, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $6, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $7, %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: cmovew %ax, %cx
> -; AVX1-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm0
> -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
> +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
> +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
> +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
> +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
> +; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
> +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
> +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
> +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
> +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
> +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
> +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
> +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
> +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
> +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
> +; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: retq
> ;
> ; AVX2-LABEL: testv16i16:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrw $1, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %cx
> -; AVX2-NEXT: movw $31, %ax
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vmovd %xmm1, %edx
> -; AVX2-NEXT: bsrw %dx, %dx
> -; AVX2-NEXT: cmovew %ax, %dx
> -; AVX2-NEXT: vmovd %edx, %xmm2
> -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $2, %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $3, %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $4, %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $5, %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $6, %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $7, %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
> -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrw $1, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vmovd %xmm0, %edx
> -; AVX2-NEXT: bsrw %dx, %dx
> -; AVX2-NEXT: cmovew %ax, %dx
> -; AVX2-NEXT: vmovd %edx, %xmm3
> -; AVX2-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $2, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $3, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $4, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $5, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $6, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $7, %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: cmovew %ax, %cx
> -; AVX2-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm0
> -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512-LABEL: testv16i16:
> @@ -480,115 +371,56 @@ define <16 x i16> @testv16i16u(<16 x i16
> ; AVX1-LABEL: testv16i16u:
> ; AVX1: # BB#0:
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT: vpextrw $1, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vmovd %xmm1, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: vmovd %ecx, %xmm2
> -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $2, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $3, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $4, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $5, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $6, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrw $7, %xmm1, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
> -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX1-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vmovd %xmm0, %ecx
> -; AVX1-NEXT: bsrw %cx, %cx
> -; AVX1-NEXT: vmovd %ecx, %xmm3
> -; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $2, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $3, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $4, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $5, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $6, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
> -; AVX1-NEXT: vpextrw $7, %xmm0, %eax
> -; AVX1-NEXT: bsrw %ax, %ax
> -; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0
> -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
> +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
> +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
> +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
> +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
> +; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
> +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
> +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
> +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
> +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
> +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
> +; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
> +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
> +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
> +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
> +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
> +; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: retq
> ;
> ; AVX2-LABEL: testv16i16u:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrw $1, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vmovd %xmm1, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: vmovd %ecx, %xmm2
> -; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $2, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $3, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $4, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $5, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $6, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrw $7, %xmm1, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
> -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
> -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vmovd %xmm0, %ecx
> -; AVX2-NEXT: bsrw %cx, %cx
> -; AVX2-NEXT: vmovd %ecx, %xmm3
> -; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $2, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $3, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $4, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $5, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $6, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
> -; AVX2-NEXT: vpextrw $7, %xmm0, %eax
> -; AVX2-NEXT: bsrw %ax, %ax
> -; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0
> -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
> +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
> +; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
> +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
> +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
> +; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
> +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
> +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
> +; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512-LABEL: testv16i16u:
> @@ -606,335 +438,41 @@ define <32 x i8> @testv32i8(<32 x i8> %i
> ; AVX1-LABEL: testv32i8:
> ; AVX1: # BB#0:
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT: vpextrb $1, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %ecx
> -; AVX1-NEXT: movl $15, %eax
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpextrb $0, %xmm1, %edx
> -; AVX1-NEXT: bsrl %edx, %edx
> -; AVX1-NEXT: cmovel %eax, %edx
> -; AVX1-NEXT: xorl $7, %edx
> -; AVX1-NEXT: vmovd %edx, %xmm2
> -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $2, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $3, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $5, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $7, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $9, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
> -; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpextrb $0, %xmm0, %edx
> -; AVX1-NEXT: bsrl %edx, %edx
> -; AVX1-NEXT: cmovel %eax, %edx
> -; AVX1-NEXT: xorl $7, %edx
> -; AVX1-NEXT: vmovd %edx, %xmm2
> -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $3, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $5, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $6, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $7, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $9, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $15, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: cmovel %eax, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
> +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
> +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
> +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
> +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
> +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
> +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
> +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
> +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
> +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
> +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: retq
> ;
> ; AVX2-LABEL: testv32i8:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrb $1, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %ecx
> -; AVX2-NEXT: movl $15, %eax
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpextrb $0, %xmm1, %edx
> -; AVX2-NEXT: bsrl %edx, %edx
> -; AVX2-NEXT: cmovel %eax, %edx
> -; AVX2-NEXT: xorl $7, %edx
> -; AVX2-NEXT: vmovd %edx, %xmm2
> -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $2, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
> -; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpextrb $0, %xmm0, %edx
> -; AVX2-NEXT: bsrl %edx, %edx
> -; AVX2-NEXT: cmovel %eax, %edx
> -; AVX2-NEXT: xorl $7, %edx
> -; AVX2-NEXT: vmovd %edx, %xmm2
> -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $3, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $5, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $7, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $13, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: cmovel %eax, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
> +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
> +; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv32i8:
> @@ -974,269 +512,41 @@ define <32 x i8> @testv32i8u(<32 x i8> %
> ; AVX1-LABEL: testv32i8u:
> ; AVX1: # BB#0:
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
> -; AVX1-NEXT: vpextrb $1, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vmovd %ecx, %xmm2
> -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $2, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $3, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $4, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $5, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $6, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $7, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $8, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $9, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $10, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $11, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $12, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $13, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $14, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $15, %xmm1, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
> -; AVX1-NEXT: vpextrb $1, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpextrb $0, %xmm0, %ecx
> -; AVX1-NEXT: bsrl %ecx, %ecx
> -; AVX1-NEXT: xorl $7, %ecx
> -; AVX1-NEXT: vmovd %ecx, %xmm2
> -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $2, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $3, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $4, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $5, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $6, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $7, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $8, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $9, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $10, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $11, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $12, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $13, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $14, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX1-NEXT: vpextrb $15, %xmm0, %eax
> -; AVX1-NEXT: bsrl %eax, %eax
> -; AVX1-NEXT: xorl $7, %eax
> -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
> +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
> +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
> +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
> +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
> +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
> +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
> +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
> +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
> +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
> +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
> +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
> +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
> +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
> +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: retq
> ;
> ; AVX2-LABEL: testv32i8u:
> ; AVX2: # BB#0:
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
> -; AVX2-NEXT: vpextrb $1, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vmovd %ecx, %xmm2
> -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $2, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $3, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $4, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $5, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $6, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $7, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $8, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $9, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $10, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $11, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $12, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $13, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $14, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $15, %xmm1, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
> -; AVX2-NEXT: vpextrb $1, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpextrb $0, %xmm0, %ecx
> -; AVX2-NEXT: bsrl %ecx, %ecx
> -; AVX2-NEXT: xorl $7, %ecx
> -; AVX2-NEXT: vmovd %ecx, %xmm2
> -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $2, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $3, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $4, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $5, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $6, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $7, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $8, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $9, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $10, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $11, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $12, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $13, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $14, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
> -; AVX2-NEXT: vpextrb $15, %xmm0, %eax
> -; AVX2-NEXT: bsrl %eax, %eax
> -; AVX2-NEXT: xorl $7, %eax
> -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
> -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
> +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
> +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
> +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
> +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
> +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
> +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
> +; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> ;
> ; AVX512VLCD-LABEL: testv32i8u:
>
>
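And the 256-bit cases just repeat that per-byte trick, then fold pairs of partial counts up to the element width (the vpcmpeq/vpsrl/vpand/vpadd rounds in each AVX2 block). A scalar model of the whole scheme, again only my reading of the generated sequences rather than code from the patch:

  #include <cstdint>
  #include <cstdio>

  static const uint8_t NibbleLUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                        0, 0, 0, 0, 0, 0, 0, 0};

  // Same nibble-table step as sketched earlier for v16i8.
  static uint32_t Ctlz8(uint8_t X) {
    uint8_t Hi = X >> 4;
    return NibbleLUT[Hi] + (Hi == 0 ? NibbleLUT[X & 0xF] : 0);
  }

  // Each widening round only adds the low half's count when the high
  // half of the element is entirely zero.
  static uint32_t Ctlz16(uint16_t X) {  // vpcmpeqb + vpsrlw $8 + vpand + vpaddw
    return Ctlz8(X >> 8) + ((X >> 8) == 0 ? Ctlz8(X & 0xFF) : 0);
  }

  static uint32_t Ctlz32(uint32_t X) {  // vpcmpeqw + vpsrld $16 + vpand + vpaddd
    return Ctlz16(X >> 16) + ((X >> 16) == 0 ? Ctlz16(X & 0xFFFF) : 0);
  }

  static uint32_t Ctlz64(uint64_t X) {  // vpcmpeqd + vpsrlq $32 + vpand + vpaddq
    return Ctlz32(X >> 32) + ((X >> 32) == 0 ? Ctlz32(X & 0xFFFFFFFF) : 0);
  }

  int main() {
    // Expect 64, 63 and 15 leading zeros respectively.
    printf("%u %u %u\n", Ctlz64(0), Ctlz64(1), Ctlz32(0x00010000u));
    return 0;
  }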