[llvm] r269646 - [X86][SSSE3] Lower vector CTLZ with PSHUFB lookups

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon May 16 04:19:12 PDT 2016


Author: rksimon
Date: Mon May 16 06:19:11 2016
New Revision: 269646

URL: http://llvm.org/viewvc/llvm-project?rev=269646&view=rev
Log:
[X86][SSSE3] Lower vector CTLZ with PSHUFB lookups

This patch uses PSHUFB to lower vector CTLZ and avoid the (slower) scalarized lowering.

The leading zero count of each 4-bit nibble of the vector is determined using a PSHUFB lookup table. Pairs of results are then repeatedly combined up to the original element width.

Differential Revision: http://reviews.llvm.org/D20016

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=269646&r1=269645&r2=269646&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon May 16 06:19:11 2016
@@ -864,6 +864,13 @@ X86TargetLowering::X86TargetLowering(con
     }
   }
 
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+    setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
+    setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
+    // ISD::CTLZ v4i32 - scalarization is faster.
+    // ISD::CTLZ v2i64 - scalarization is faster.
+  }
+
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
@@ -932,6 +939,8 @@ X86TargetLowering::X86TargetLowering(con
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+    bool HasInt256 = Subtarget.hasInt256();
+
     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
@@ -998,14 +1007,21 @@ X86TargetLowering::X86TargetLowering(con
       setOperationAction(ISD::CTTZ,            VT, Custom);
     }
 
+    // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
+    // as we end up splitting the 256-bit vectors.
+    for (auto VT : { MVT::v32i8, MVT::v16i16 })
+      setOperationAction(ISD::CTLZ,            VT, Custom);
+
+    if (HasInt256)
+      for (auto VT : { MVT::v8i32, MVT::v4i64 })
+        setOperationAction(ISD::CTLZ,          VT, Custom);
+
     if (Subtarget.hasAnyFMA()) {
       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                        MVT::v2f64, MVT::v4f64 })
         setOperationAction(ISD::FMA, VT, Legal);
     }
 
-    bool HasInt256 = Subtarget.hasInt256();
-
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
@@ -18767,7 +18783,105 @@ static SDValue LowerVectorCTLZ_AVX512(SD
   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
 }
 
-static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, SDLoc DL,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT SVT = VT.getScalarType();
+  int NumElts = VT.getVectorNumElements();
+  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
+
+  // Per-nibble leading zero PSHUFB lookup table.
+  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
+
+  SmallVector<SDValue, 64> LUTVec;
+  for (int i = 0; i < NumBytes; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
+
+  // Begin by bitcasting the input to a byte vector, then split those bytes
+  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
+  // If the hi input nibble is zero then we add both results together, otherwise
+  // we just take the hi result (by masking the lo result to zero before the
+  // add).
+  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+
+  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+  // Merge the result from vXi8 back to VT, working on the lo/hi halves
+  // of the current vector width in the same way we did for the nibbles.
+  // If the upper half of the input element is zero then add the halves'
+  // leading zero counts together, otherwise just use the upper half's.
+  // Double the width of the result until we are at target width.
+  while (CurrVT != VT) {
+    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+    int CurrNumElts = CurrVT.getVectorNumElements();
+    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+    // Check if the upper half of the input element is zero.
+    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+    HiZ = DAG.getBitcast(NextVT, HiZ);
+
+    // Move the upper/lower halves to the lower bits as we'll be extending to
+    // NextVT. Mask the lower result to zero if HiZ is true and add the results
+    // together.
+    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+    CurrVT = NextVT;
+  }
+
+  return Res;
+}
+
+static SDValue LowerVectorCTLZ(SDValue Op, SDLoc DL,
+                               const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue Op0 = Op.getOperand(0);
+
+  if (Subtarget.hasAVX512())
+    return LowerVectorCTLZ_AVX512(Op, DAG);
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, perform ctlz and concat the result.
+    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
+                       DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
+  }
+
+  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   MVT OpVT = VT;
   unsigned NumBits = VT.getSizeInBits();
@@ -18775,7 +18889,7 @@ static SDValue LowerCTLZ(SDValue Op, Sel
   unsigned Opc = Op.getOpcode();
 
   if (VT.isVector())
-    return LowerVectorCTLZ_AVX512(Op, DAG);
+    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
 
   Op = Op.getOperand(0);
   if (VT == MVT::i8) {
@@ -21304,7 +21418,7 @@ SDValue X86TargetLowering::LowerOperatio
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:
-  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll?rev=269646&r1=269645&r2=269646&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll Mon May 16 06:19:11 2016
@@ -706,145 +706,70 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pextrw $7, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %cx
-; SSSE3-NEXT:    movw $31, %ax
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    pextrw $3, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    pextrw $5, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    pextrw $1, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT:    pextrw $6, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    pextrw $2, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT:    pextrw $4, %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    movd %xmm0, %ecx
-; SSSE3-NEXT:    bsrw %cx, %cx
-; SSSE3-NEXT:    cmovew %ax, %cx
-; SSSE3-NEXT:    xorl $15, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    psrlw $4, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pshufb %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    paddb %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    psrlw $8, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    psrlw $8, %xmm1
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv8i16:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrw $1, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %cx
-; SSE41-NEXT:    movw $31, %ax
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    movd %xmm0, %edx
-; SSE41-NEXT:    bsrw %dx, %dx
-; SSE41-NEXT:    cmovew %ax, %dx
-; SSE41-NEXT:    xorl $15, %edx
-; SSE41-NEXT:    movd %edx, %xmm1
-; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
-; SSE41-NEXT:    pextrw $2, %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
-; SSE41-NEXT:    pextrw $3, %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
-; SSE41-NEXT:    pextrw $4, %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
-; SSE41-NEXT:    pextrw $5, %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
-; SSE41-NEXT:    pextrw $6, %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
-; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    cmovew %ax, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    pinsrw $7, %ecx, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $4, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm4, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv8i16:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %cx
-; AVX-NEXT:    movw $31, %ax
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vmovd %xmm0, %edx
-; AVX-NEXT:    bsrw %dx, %dx
-; AVX-NEXT:    cmovew %ax, %dx
-; AVX-NEXT:    xorl $15, %edx
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $5, %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $6, %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    cmovew %ax, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv8i16:
@@ -865,47 +790,25 @@ define <8 x i16> @testv8i16(<8 x i16> %i
 ;
 ; X32-SSE-LABEL: testv8i16:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %cx
-; X32-SSE-NEXT:    movw $31, %ax
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    movd %xmm0, %edx
-; X32-SSE-NEXT:    bsrw %dx, %dx
-; X32-SSE-NEXT:    cmovew %ax, %dx
-; X32-SSE-NEXT:    xorl $15, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $5, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $6, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    cmovew %ax, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    pinsrw $7, %ecx, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm1
+; X32-SSE-NEXT:    paddw %xmm0, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
@@ -1001,118 +904,70 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; SSSE3-LABEL: testv8i16u:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pextrw $7, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    pextrw $3, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    pextrw $5, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    pextrw $1, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT:    pextrw $6, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    pextrw $2, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT:    pextrw $4, %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    movd %xmm0, %eax
-; SSSE3-NEXT:    bsrw %ax, %ax
-; SSSE3-NEXT:    xorl $15, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    psrlw $4, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pshufb %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    paddb %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    psrlw $8, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    psrlw $8, %xmm1
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv8i16u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrw $1, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    movd %xmm0, %ecx
-; SSE41-NEXT:    bsrw %cx, %cx
-; SSE41-NEXT:    xorl $15, %ecx
-; SSE41-NEXT:    movd %ecx, %xmm1
-; SSE41-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE41-NEXT:    pextrw $2, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE41-NEXT:    pextrw $3, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE41-NEXT:    pextrw $4, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    pinsrw $4, %eax, %xmm1
-; SSE41-NEXT:    pextrw $5, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    pinsrw $5, %eax, %xmm1
-; SSE41-NEXT:    pextrw $6, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    pinsrw $6, %eax, %xmm1
-; SSE41-NEXT:    pextrw $7, %xmm0, %eax
-; SSE41-NEXT:    bsrw %ax, %ax
-; SSE41-NEXT:    xorl $15, %eax
-; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $4, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm4, %xmm1
+; SSE41-NEXT:    paddb %xmm3, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv8i16u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vmovd %xmm0, %ecx
-; AVX-NEXT:    bsrw %cx, %cx
-; AVX-NEXT:    xorl $15, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX-NEXT:    bsrw %ax, %ax
-; AVX-NEXT:    xorl $15, %eax
-; AVX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv8i16u:
@@ -1133,38 +988,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %
 ;
 ; X32-SSE-LABEL: testv8i16u:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    movd %xmm0, %ecx
-; X32-SSE-NEXT:    bsrw %cx, %cx
-; X32-SSE-NEXT:    xorl $15, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm1
-; X32-SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; X32-SSE-NEXT:    pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; X32-SSE-NEXT:    pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; X32-SSE-NEXT:    pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    pinsrw $4, %eax, %xmm1
-; X32-SSE-NEXT:    pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT:    pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    pinsrw $6, %eax, %xmm1
-; X32-SSE-NEXT:    pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT:    bsrw %ax, %ax
-; X32-SSE-NEXT:    xorl $15, %eax
-; X32-SSE-NEXT:    pinsrw $7, %eax, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlw $4, %xmm1
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT:    pand %xmm4, %xmm1
+; X32-SSE-NEXT:    paddb %xmm3, %xmm1
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    psrlw $8, %xmm1
+; X32-SSE-NEXT:    paddw %xmm0, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
@@ -1384,278 +1226,53 @@ define <16 x i8> @testv16i8(<16 x i8> %i
 ;
 ; SSSE3-LABEL: testv16i8:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pushq %rbp
-; SSSE3-NEXT:    pushq %rbx
-; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    bsrl %eax, %ecx
-; SSSE3-NEXT:    movl $15, %eax
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    bsrl %ecx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %edx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSSE3-NEXT:    bsrl %ebp, %ebp
-; SSSE3-NEXT:    cmovel %eax, %ebp
-; SSSE3-NEXT:    xorl $7, %ebp
-; SSSE3-NEXT:    movd %ebp, %xmm0
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT:    bsrl %edi, %edi
-; SSSE3-NEXT:    cmovel %eax, %edi
-; SSSE3-NEXT:    xorl $7, %edi
-; SSSE3-NEXT:    movd %edi, %xmm1
-; SSSE3-NEXT:    bsrl %ecx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT:    bsrl %esi, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    bsrl %ecx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %ebx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    bsrl %edx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %r11d, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    bsrl %esi, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSSE3-NEXT:    bsrl %r9d, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    bsrl %r10d, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %r8d, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm4
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    bsrl %ecx, %ecx
-; SSSE3-NEXT:    cmovel %eax, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT:    popq %rbx
-; SSSE3-NEXT:    popq %rbp
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrb $1, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %ecx
-; SSE41-NEXT:    movl $15, %eax
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pextrb $0, %xmm0, %edx
-; SSE41-NEXT:    bsrl %edx, %edx
-; SSE41-NEXT:    cmovel %eax, %edx
-; SSE41-NEXT:    xorl $7, %edx
-; SSE41-NEXT:    movd %edx, %xmm1
-; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $3, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $5, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $7, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $9, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $11, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $13, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $14, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT:    pextrb $15, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    cmovel %eax, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    pinsrb $15, %ecx, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pshufb %xmm3, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %ecx
-; AVX-NEXT:    movl $15, %eax
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX-NEXT:    bsrl %edx, %edx
-; AVX-NEXT:    cmovel %eax, %edx
-; AVX-NEXT:    xorl $7, %edx
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $5, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $7, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $9, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $11, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $13, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $15, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    cmovel %eax, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vpinsrb $15, %ecx, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8:
@@ -1668,87 +1285,19 @@ define <16 x i8> @testv16i8(<16 x i8> %i
 ;
 ; X32-SSE-LABEL: testv16i8:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pextrb $1, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %ecx
-; X32-SSE-NEXT:    movl $15, %eax
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pextrb $0, %xmm0, %edx
-; X32-SSE-NEXT:    bsrl %edx, %edx
-; X32-SSE-NEXT:    cmovel %eax, %edx
-; X32-SSE-NEXT:    xorl $7, %edx
-; X32-SSE-NEXT:    movd %edx, %xmm1
-; X32-SSE-NEXT:    pinsrb $1, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $2, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $2, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $3, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $3, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $4, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $4, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $5, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $5, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $6, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $6, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $7, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $7, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $8, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $8, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $9, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $9, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $10, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $10, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $11, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $11, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $12, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $12, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $13, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $13, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $14, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $14, %ecx, %xmm1
-; X32-SSE-NEXT:    pextrb $15, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    cmovel %eax, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    pinsrb $15, %ecx, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
@@ -1930,225 +1479,53 @@ define <16 x i8> @testv16i8u(<16 x i8> %
 ;
 ; SSSE3-LABEL: testv16i8u:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pushq %rbx
-; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    bsrl %eax, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT:    bsrl %esi, %esi
-; SSSE3-NEXT:    xorl $7, %esi
-; SSSE3-NEXT:    movd %esi, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %eax, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT:    bsrl %ebx, %ebx
-; SSSE3-NEXT:    xorl $7, %ebx
-; SSSE3-NEXT:    movd %ebx, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT:    bsrl %edx, %edx
-; SSSE3-NEXT:    xorl $7, %edx
-; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    bsrl %esi, %edx
-; SSSE3-NEXT:    xorl $7, %edx
-; SSSE3-NEXT:    movd %edx, %xmm3
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %ecx, %ecx
-; SSSE3-NEXT:    xorl $7, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT:    bsrl %edx, %edx
-; SSSE3-NEXT:    xorl $7, %edx
-; SSSE3-NEXT:    movd %edx, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT:    bsrl %edi, %edx
-; SSSE3-NEXT:    xorl $7, %edx
-; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    bsrl %eax, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %r10d, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    bsrl %ecx, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT:    bsrl %r9d, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    bsrl %r11d, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    bsrl %r8d, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm4
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    bsrl %eax, %eax
-; SSSE3-NEXT:    xorl $7, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT:    popq %rbx
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrb $1, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE41-NEXT:    bsrl %ecx, %ecx
-; SSE41-NEXT:    xorl $7, %ecx
-; SSE41-NEXT:    movd %ecx, %xmm1
-; SSE41-NEXT:    pinsrb $1, %eax, %xmm1
-; SSE41-NEXT:    pextrb $2, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $2, %eax, %xmm1
-; SSE41-NEXT:    pextrb $3, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $3, %eax, %xmm1
-; SSE41-NEXT:    pextrb $4, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $4, %eax, %xmm1
-; SSE41-NEXT:    pextrb $5, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $5, %eax, %xmm1
-; SSE41-NEXT:    pextrb $6, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $6, %eax, %xmm1
-; SSE41-NEXT:    pextrb $7, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $7, %eax, %xmm1
-; SSE41-NEXT:    pextrb $8, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $8, %eax, %xmm1
-; SSE41-NEXT:    pextrb $9, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $9, %eax, %xmm1
-; SSE41-NEXT:    pextrb $10, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $10, %eax, %xmm1
-; SSE41-NEXT:    pextrb $11, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $11, %eax, %xmm1
-; SSE41-NEXT:    pextrb $12, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
-; SSE41-NEXT:    pextrb $13, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $13, %eax, %xmm1
-; SSE41-NEXT:    pextrb $14, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $14, %eax, %xmm1
-; SSE41-NEXT:    pextrb $15, %xmm0, %eax
-; SSE41-NEXT:    bsrl %eax, %eax
-; SSE41-NEXT:    xorl $7, %eax
-; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    pshufb %xmm3, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm4, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX-NEXT:    bsrl %ecx, %ecx
-; AVX-NEXT:    xorl $7, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX-NEXT:    bsrl %eax, %eax
-; AVX-NEXT:    xorl $7, %eax
-; AVX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i8u:
@@ -2161,70 +1538,19 @@ define <16 x i8> @testv16i8u(<16 x i8> %
 ;
 ; X32-SSE-LABEL: testv16i8u:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pextrb $1, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pextrb $0, %xmm0, %ecx
-; X32-SSE-NEXT:    bsrl %ecx, %ecx
-; X32-SSE-NEXT:    xorl $7, %ecx
-; X32-SSE-NEXT:    movd %ecx, %xmm1
-; X32-SSE-NEXT:    pinsrb $1, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $2, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $2, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $3, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $3, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $4, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $4, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $5, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $5, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $6, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $6, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $7, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $7, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $8, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $8, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $9, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $9, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $10, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $10, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $11, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $11, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $12, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $12, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $13, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $13, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $14, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $14, %eax, %xmm1
-; X32-SSE-NEXT:    pextrb $15, %xmm0, %eax
-; X32-SSE-NEXT:    bsrl %eax, %eax
-; X32-SSE-NEXT:    xorl $7, %eax
-; X32-SSE-NEXT:    pinsrb $15, %eax, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm4, %xmm2
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm2, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll?rev=269646&r1=269645&r2=269646&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll Mon May 16 06:19:11 2016
@@ -35,30 +35,32 @@ define <4 x i64> @testv4i64(<4 x i64> %i
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    movl $127, %ecx
-; AVX2-NEXT:    cmoveq %rcx, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    cmoveq %rcx, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    cmoveq %rcx, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm3
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    cmoveq %rcx, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv4i64:
@@ -101,25 +103,32 @@ define <4 x i64> @testv4i64u(<4 x i64> %
 ;
 ; AVX2-LABEL: testv4i64u:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm3
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    bsrq %rax, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv4i64u:
@@ -181,44 +190,27 @@ define <8 x i32> @testv8i32(<8 x i32> %i
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrd $1, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %ecx
-; AVX2-NEXT:    movl $63, %eax
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    vmovd %xmm1, %edx
-; AVX2-NEXT:    bsrl %edx, %edx
-; AVX2-NEXT:    cmovel %eax, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm1
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %edx
-; AVX2-NEXT:    bsrl %edx, %edx
-; AVX2-NEXT:    cmovel %eax, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm3
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm3, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv8i32:
@@ -271,35 +263,27 @@ define <8 x i32> @testv8i32u(<8 x i32> %
 ;
 ; AVX2-LABEL: testv8i32u:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrd $1, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    vmovd %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    vmovd %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm3
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv8i32u:
@@ -320,149 +304,56 @@ define <16 x i16> @testv16i16(<16 x i16>
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrw $1, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %cx
-; AVX1-NEXT:    movw $31, %ax
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vmovd %xmm1, %edx
-; AVX1-NEXT:    bsrw %dx, %dx
-; AVX1-NEXT:    cmovew %ax, %dx
-; AVX1-NEXT:    vmovd %edx, %xmm2
-; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $2, %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $3, %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $4, %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $6, %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $7, %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrw $1, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vmovd %xmm0, %edx
-; AVX1-NEXT:    bsrw %dx, %dx
-; AVX1-NEXT:    cmovew %ax, %dx
-; AVX1-NEXT:    vmovd %edx, %xmm3
-; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $2, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $3, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $5, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $6, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $7, %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    cmovew %ax, %cx
-; AVX1-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %cx
-; AVX2-NEXT:    movw $31, %ax
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vmovd %xmm1, %edx
-; AVX2-NEXT:    bsrw %dx, %dx
-; AVX2-NEXT:    cmovew %ax, %dx
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $2, %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $3, %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $4, %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $6, %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $7, %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrw $1, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vmovd %xmm0, %edx
-; AVX2-NEXT:    bsrw %dx, %dx
-; AVX2-NEXT:    cmovew %ax, %dx
-; AVX2-NEXT:    vmovd %edx, %xmm3
-; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $2, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $3, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $5, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $6, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $7, %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    cmovew %ax, %cx
-; AVX2-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i16:
@@ -480,115 +371,56 @@ define <16 x i16> @testv16i16u(<16 x i16
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrw $1, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vmovd %xmm1, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $2, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $4, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $5, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vmovd %xmm0, %ecx
-; AVX1-NEXT:    bsrw %cx, %cx
-; AVX1-NEXT:    vmovd %ecx, %xmm3
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX1-NEXT:    bsrw %ax, %ax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv16i16u:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vmovd %xmm1, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $2, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $4, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $5, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vmovd %xmm0, %ecx
-; AVX2-NEXT:    bsrw %cx, %cx
-; AVX2-NEXT:    vmovd %ecx, %xmm3
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX2-NEXT:    bsrw %ax, %ax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: testv16i16u:
@@ -606,335 +438,41 @@ define <32 x i8> @testv32i8(<32 x i8> %i
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %ecx
-; AVX1-NEXT:    movl $15, %eax
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX1-NEXT:    bsrl %edx, %edx
-; AVX1-NEXT:    cmovel %eax, %edx
-; AVX1-NEXT:    xorl $7, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm2
-; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX1-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX1-NEXT:    bsrl %edx, %edx
-; AVX1-NEXT:    cmovel %eax, %edx
-; AVX1-NEXT:    xorl $7, %edx
-; AVX1-NEXT:    vmovd %edx, %xmm2
-; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $5, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $7, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $9, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $11, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $13, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $15, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    cmovel %eax, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %ecx
-; AVX2-NEXT:    movl $15, %eax
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
-; AVX2-NEXT:    bsrl %edx, %edx
-; AVX2-NEXT:    cmovel %eax, %edx
-; AVX2-NEXT:    xorl $7, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
-; AVX2-NEXT:    bsrl %edx, %edx
-; AVX2-NEXT:    cmovel %eax, %edx
-; AVX2-NEXT:    xorl $7, %edx
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $7, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $9, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $11, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $13, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    cmovel %eax, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vpinsrb $15, %ecx, %xmm2, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv32i8:
@@ -974,269 +512,41 @@ define <32 x i8> @testv32i8u(<32 x i8> %
 ; AVX1-LABEL: testv32i8u:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX1-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    bsrl %ecx, %ecx
-; AVX1-NEXT:    xorl $7, %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX1-NEXT:    bsrl %eax, %eax
-; AVX1-NEXT:    xorl $7, %eax
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8u:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    bsrl %ecx, %ecx
-; AVX2-NEXT:    xorl $7, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX2-NEXT:    bsrl %eax, %eax
-; AVX2-NEXT:    xorl $7, %eax
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLCD-LABEL: testv32i8u:



