[llvm] r269646 - [X86][SSSE3] Lower vector CTLZ with PSHUFB lookups
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon May 16 04:19:12 PDT 2016
Author: rksimon
Date: Mon May 16 06:19:11 2016
New Revision: 269646
URL: http://llvm.org/viewvc/llvm-project?rev=269646&view=rev
Log:
[X86][SSSE3] Lower vector CTLZ with PSHUFB lookups
This patch uses PSHUFB to lower vector CTLZ and avoid (slower) scalarizations.
The leading zero count of each 4-bit nibble of the vector is determined by using a PSHUFB lookup. Pairs of results are then repeatedly combined up to the original element width.
Differential Revision: http://reviews.llvm.org/D20016
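For reference, the per-nibble lookup scheme described above can be sketched in
scalar C++ (an illustration of the technique only, not code from this patch;
the ctlz8/ctlz16 helper names are hypothetical). The vector lowering performs
the same steps element-wise, using PAND/PSRLW for the nibble split, PSHUFB for
the table lookup, PCMPEQB for the zero test and PADDB/PADDW for the combine:

  #include <cstdint>

  // Leading zero count of a 4-bit nibble (LUT[0] == 4 covers the all-zero
  // nibble).
  static const uint8_t LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};

  // CTLZ of a byte: take the high nibble's count; if the high nibble is
  // zero, add the low nibble's count as well.
  static uint8_t ctlz8(uint8_t b) {
    uint8_t Hi = LUT[b >> 4];
    uint8_t Lo = LUT[b & 0xF];
    return ((b >> 4) == 0) ? uint8_t(Hi + Lo) : Hi;
  }

  // CTLZ of an i16: combine the two byte counts the same way, mirroring one
  // iteration of the doubling loop in the patch (ctlz16(0) == 16).
  static uint8_t ctlz16(uint16_t x) {
    uint8_t Hi = ctlz8(uint8_t(x >> 8));
    uint8_t Lo = ctlz8(uint8_t(x & 0xFF));
    return ((x >> 8) == 0) ? uint8_t(Hi + Lo) : Hi;
  }

The same combine step repeats once more per doubling to reach i32 and i64
elements, which is why the AVX2 v4i64 codegen below shows three rounds of
pcmpeq/psrl/padd after the initial byte lookup.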
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=269646&r1=269645&r2=269646&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon May 16 06:19:11 2016
@@ -864,6 +864,13 @@ X86TargetLowering::X86TargetLowering(con
}
}
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ // ISD::CTLZ v4i32 - scalarization is faster.
+ // ISD::CTLZ v2i64 - scalarization is faster.
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
@@ -932,6 +939,8 @@ X86TargetLowering::X86TargetLowering(con
}
if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+ bool HasInt256 = Subtarget.hasInt256();
+
addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
@@ -998,14 +1007,21 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::CTTZ, VT, Custom);
}
+ // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
+ // as we end up splitting the 256-bit vectors.
+ for (auto VT : { MVT::v32i8, MVT::v16i16 })
+ setOperationAction(ISD::CTLZ, VT, Custom);
+
+ if (HasInt256)
+ for (auto VT : { MVT::v8i32, MVT::v4i64 })
+ setOperationAction(ISD::CTLZ, VT, Custom);
+
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::FMA, VT, Legal);
}
- bool HasInt256 = Subtarget.hasInt256();
-
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
@@ -18767,7 +18783,105 @@ static SDValue LowerVectorCTLZ_AVX512(SD
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
-static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, SDLoc DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT SVT = VT.getScalarType();
+ int NumElts = VT.getVectorNumElements();
+ int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+ MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
+
+ // Per-nibble leading zero PSHUFB lookup table.
+ const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+ /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+ /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
+
+ SmallVector<SDValue, 64> LUTVec;
+ for (int i = 0; i < NumBytes; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
+
+  // Begin by bitcasting the input to a byte vector, then split those bytes
+  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
+ // If the hi input nibble is zero then we add both results together, otherwise
+ // we just take the hi result (by masking the lo result to zero before the
+ // add).
+ SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+ SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+ SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+ SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+ SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+
+ Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+ Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+ Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+ SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+  // Merge the result from vXi8 back to VT, working on the lo/hi halves
+ // of the current vector width in the same way we did for the nibbles.
+ // If the upper half of the input element is zero then add the halves'
+ // leading zero counts together, otherwise just use the upper half's.
+ // Double the width of the result until we are at target width.
+ while (CurrVT != VT) {
+ int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+ int CurrNumElts = CurrVT.getVectorNumElements();
+ MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+ MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+ SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+ // Check if the upper half of the input element is zero.
+ SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getBitcast(NextVT, HiZ);
+
+    // Move the upper half's count down to the lower bits as we'll be extending
+    // to NextVT. Keep the lower half's count only where the upper half of the
+    // element is zero (HiZ set), then add the two results together.
+ SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+ SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+ SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+ R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+ Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+ CurrVT = NextVT;
+ }
+
+ return Res;
+}
+
+static SDValue LowerVectorCTLZ(SDValue Op, SDLoc DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDValue Op0 = Op.getOperand(0);
+
+ if (Subtarget.hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extract each 128-bit vector, perform ctlz and concat the result.
+ SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
+ DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
+ }
+
+ assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+ return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -18775,7 +18889,7 @@ static SDValue LowerCTLZ(SDValue Op, Sel
unsigned Opc = Op.getOpcode();
if (VT.isVector())
- return LowerVectorCTLZ_AVX512(Op, DAG);
+ return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -21304,7 +21418,7 @@ SDValue X86TargetLowering::LowerOperatio
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ:
- case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, DAG);
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll?rev=269646&r1=269645&r2=269646&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll Mon May 16 06:19:11 2016
@@ -706,145 +706,70 @@ define <8 x i16> @testv8i16(<8 x i16> %i
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %cx
-; SSSE3-NEXT: movw $31, %ax
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: pextrw $3, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: pextrw $5, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: pextrw $1, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: pextrw $6, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: pextrw $2, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movd %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: paddb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm1
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrw $1, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %cx
-; SSE41-NEXT: movw $31, %ax
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: bsrw %dx, %dx
-; SSE41-NEXT: cmovew %ax, %dx
-; SSE41-NEXT: xorl $15, %edx
-; SSE41-NEXT: movd %edx, %xmm1
-; SSE41-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE41-NEXT: pextrw $2, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $2, %ecx, %xmm1
-; SSE41-NEXT: pextrw $3, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $3, %ecx, %xmm1
-; SSE41-NEXT: pextrw $4, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $4, %ecx, %xmm1
-; SSE41-NEXT: pextrw $5, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $5, %ecx, %xmm1
-; SSE41-NEXT: pextrw $6, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $6, %ecx, %xmm1
-; SSE41-NEXT: pextrw $7, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $7, %ecx, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %cx
-; AVX-NEXT: movw $31, %ax
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vmovd %xmm0, %edx
-; AVX-NEXT: bsrw %dx, %dx
-; AVX-NEXT: cmovew %ax, %dx
-; AVX-NEXT: xorl $15, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16:
@@ -865,47 +790,25 @@ define <8 x i16> @testv8i16(<8 x i16> %i
;
; X32-SSE-LABEL: testv8i16:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %cx
-; X32-SSE-NEXT: movw $31, %ax
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: movd %xmm0, %edx
-; X32-SSE-NEXT: bsrw %dx, %dx
-; X32-SSE-NEXT: cmovew %ax, %dx
-; X32-SSE-NEXT: xorl $15, %edx
-; X32-SSE-NEXT: movd %edx, %xmm1
-; X32-SSE-NEXT: pinsrw $1, %ecx, %xmm1
-; X32-SSE-NEXT: pextrw $2, %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: pinsrw $2, %ecx, %xmm1
-; X32-SSE-NEXT: pextrw $3, %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: pinsrw $3, %ecx, %xmm1
-; X32-SSE-NEXT: pextrw $4, %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: pinsrw $4, %ecx, %xmm1
-; X32-SSE-NEXT: pextrw $5, %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: pinsrw $5, %ecx, %xmm1
-; X32-SSE-NEXT: pextrw $6, %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: pinsrw $6, %ecx, %xmm1
-; X32-SSE-NEXT: pextrw $7, %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: cmovew %ax, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: pinsrw $7, %ecx, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: paddw %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
@@ -1001,118 +904,70 @@ define <8 x i16> @testv8i16u(<8 x i16> %
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: pextrw $3, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: pextrw $5, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: pextrw $1, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movd %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: paddb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm1
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrw $1, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrw $1, %eax, %xmm1
-; SSE41-NEXT: pextrw $2, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $2, %eax, %xmm1
-; SSE41-NEXT: pextrw $3, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $3, %eax, %xmm1
-; SSE41-NEXT: pextrw $4, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $4, %eax, %xmm1
-; SSE41-NEXT: pextrw $5, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $5, %eax, %xmm1
-; SSE41-NEXT: pextrw $6, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $6, %eax, %xmm1
-; SSE41-NEXT: pextrw $7, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $7, %eax, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16u:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16u:
@@ -1133,38 +988,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %
;
; X32-SSE-LABEL: testv8i16u:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pextrw $1, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: movd %xmm0, %ecx
-; X32-SSE-NEXT: bsrw %cx, %cx
-; X32-SSE-NEXT: xorl $15, %ecx
-; X32-SSE-NEXT: movd %ecx, %xmm1
-; X32-SSE-NEXT: pinsrw $1, %eax, %xmm1
-; X32-SSE-NEXT: pextrw $2, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: pinsrw $2, %eax, %xmm1
-; X32-SSE-NEXT: pextrw $3, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: pinsrw $3, %eax, %xmm1
-; X32-SSE-NEXT: pextrw $4, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: pinsrw $4, %eax, %xmm1
-; X32-SSE-NEXT: pextrw $5, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: pinsrw $5, %eax, %xmm1
-; X32-SSE-NEXT: pextrw $6, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: pinsrw $6, %eax, %xmm1
-; X32-SSE-NEXT: pextrw $7, %xmm0, %eax
-; X32-SSE-NEXT: bsrw %ax, %ax
-; X32-SSE-NEXT: xorl $15, %eax
-; X32-SSE-NEXT: pinsrw $7, %eax, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: paddw %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
@@ -1384,278 +1226,53 @@ define <16 x i8> @testv16i8(<16 x i8> %i
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %ecx
-; SSSE3-NEXT: movl $15, %eax
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %edx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSSE3-NEXT: bsrl %ebp, %ebp
-; SSSE3-NEXT: cmovel %eax, %ebp
-; SSSE3-NEXT: xorl $7, %ebp
-; SSSE3-NEXT: movd %ebp, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: bsrl %edi, %edi
-; SSSE3-NEXT: cmovel %eax, %edi
-; SSSE3-NEXT: xorl $7, %edi
-; SSSE3-NEXT: movd %edi, %xmm1
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: bsrl %esi, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %ebx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %edx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r11d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %esi, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSSE3-NEXT: bsrl %r9d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %r10d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r8d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %rbp
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %ecx
-; SSE41-NEXT: movl $15, %eax
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pextrb $0, %xmm0, %edx
-; SSE41-NEXT: bsrl %edx, %edx
-; SSE41-NEXT: cmovel %eax, %edx
-; SSE41-NEXT: xorl $7, %edx
-; SSE41-NEXT: movd %edx, %xmm1
-; SSE41-NEXT: pinsrb $1, %ecx, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $3, %ecx, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $4, %ecx, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $5, %ecx, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $9, %ecx, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $11, %ecx, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $13, %ecx, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $15, %ecx, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %ecx
-; AVX-NEXT: movl $15, %eax
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpextrb $0, %xmm0, %edx
-; AVX-NEXT: bsrl %edx, %edx
-; AVX-NEXT: cmovel %eax, %edx
-; AVX-NEXT: xorl $7, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: testv16i8:
@@ -1668,87 +1285,19 @@ define <16 x i8> @testv16i8(<16 x i8> %i
;
; X32-SSE-LABEL: testv16i8:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pextrb $1, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %ecx
-; X32-SSE-NEXT: movl $15, %eax
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pextrb $0, %xmm0, %edx
-; X32-SSE-NEXT: bsrl %edx, %edx
-; X32-SSE-NEXT: cmovel %eax, %edx
-; X32-SSE-NEXT: xorl $7, %edx
-; X32-SSE-NEXT: movd %edx, %xmm1
-; X32-SSE-NEXT: pinsrb $1, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $2, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $2, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $3, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $3, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $4, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $4, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $5, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $5, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $6, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $6, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $7, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $7, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $8, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $8, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $9, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $9, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $10, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $10, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $11, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $11, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $12, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $12, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $13, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $13, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $14, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $14, %ecx, %xmm1
-; X32-SSE-NEXT: pextrb $15, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: cmovel %eax, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: pinsrb $15, %ecx, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
@@ -1930,225 +1479,53 @@ define <16 x i8> @testv16i8u(<16 x i8> %
;
; SSSE3-LABEL: testv16i8u:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: bsrl %esi, %esi
-; SSSE3-NEXT: xorl $7, %esi
-; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: bsrl %ebx, %ebx
-; SSSE3-NEXT: xorl $7, %ebx
-; SSSE3-NEXT: movd %ebx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: bsrl %edx, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: bsrl %esi, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: bsrl %edx, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: bsrl %edi, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r10d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: bsrl %ecx, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: bsrl %r9d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: bsrl %r11d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r8d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: popq %rbx
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8u:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pextrb $0, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrb $1, %eax, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $2, %eax, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8u:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $3, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $4, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $5, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $6, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $7, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $8, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $9, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $10, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $11, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $12, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $13, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $14, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $15, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: testv16i8u:
@@ -2161,70 +1538,19 @@ define <16 x i8> @testv16i8u(<16 x i8> %
;
; X32-SSE-LABEL: testv16i8u:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pextrb $1, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pextrb $0, %xmm0, %ecx
-; X32-SSE-NEXT: bsrl %ecx, %ecx
-; X32-SSE-NEXT: xorl $7, %ecx
-; X32-SSE-NEXT: movd %ecx, %xmm1
-; X32-SSE-NEXT: pinsrb $1, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $2, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $2, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $3, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $3, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $4, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $4, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $5, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $5, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $6, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $6, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $7, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $7, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $8, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $8, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $9, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $9, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $10, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $10, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $11, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $11, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $12, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $12, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $13, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $13, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $14, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $14, %eax, %xmm1
-; X32-SSE-NEXT: pextrb $15, %xmm0, %eax
-; X32-SSE-NEXT: bsrl %eax, %eax
-; X32-SSE-NEXT: xorl $7, %eax
-; X32-SSE-NEXT: pinsrb $15, %eax, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
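
For readers comparing the removed and added checks above: the new byte sequence
(vpand / vpshufb / vpcmpeqb / vpand / vpaddb) is the vector form of a per-byte
recurrence in which the high nibble's lookup result always contributes and the
low nibble's result is kept only when the high nibble is zero. Below is a scalar
C++ sketch of that recurrence using the same [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
table that appears in the checks; it is illustrative only, not part of the
patch, and the name ctlz8 is just a label for the sketch.

#include <cstdint>
#include <cstdio>

// Per-nibble leading zero counts, matching the PSHUFB constant in the checks:
// [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0].
static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                 0, 0, 0, 0, 0, 0, 0, 0};

// ctlz of one byte, mirroring vpshufb (two nibble lookups), vpcmpeqb (high
// nibble zero test), vpand (mask the low-nibble count) and vpaddb (combine).
static unsigned ctlz8(uint8_t X) {
  unsigned Lo = LUT[X & 0xF];
  unsigned Hi = LUT[X >> 4];
  return Hi + ((X >> 4) == 0 ? Lo : 0);
}

int main() {
  // Expected output: 7 3 0 8 (ctlz of 0x01, 0x10, 0x80, 0x00 as 8-bit values).
  std::printf("%u %u %u %u\n", ctlz8(0x01), ctlz8(0x10), ctlz8(0x80),
              ctlz8(0x00));
  return 0;
}

Running the sketch prints 7 3 0 8, which matches what the new per-byte vector
sequence computes for those element values.
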
Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll?rev=269646&r1=269645&r2=269646&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-256.ll Mon May 16 06:19:11 2016
@@ -35,30 +35,32 @@ define <4 x i64> @testv4i64(<4 x i64> %i
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: movl $127, %ecx
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64:
@@ -101,25 +103,32 @@ define <4 x i64> @testv4i64u(<4 x i64> %
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64u:
@@ -181,44 +190,27 @@ define <8 x i32> @testv8i32(<8 x i32> %i
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %ecx
-; AVX2-NEXT: movl $63, %eax
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: vmovd %edx, %xmm3
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrd $2, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrd $3, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32:
@@ -271,35 +263,27 @@ define <8 x i32> @testv8i32u(<8 x i32> %
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32u:
@@ -320,149 +304,56 @@ define <16 x i16> @testv16i16(<16 x i16>
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %cx
-; AVX1-NEXT: movw $31, %ax
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: bsrw %dx, %dx
-; AVX1-NEXT: cmovew %ax, %dx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: bsrw %dx, %dx
-; AVX1-NEXT: cmovew %ax, %dx
-; AVX1-NEXT: vmovd %edx, %xmm3
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %cx
-; AVX2-NEXT: movw $31, %ax
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: bsrw %dx, %dx
-; AVX2-NEXT: cmovew %ax, %dx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: bsrw %dx, %dx
-; AVX2-NEXT: cmovew %ax, %dx
-; AVX2-NEXT: vmovd %edx, %xmm3
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: testv16i16:
@@ -480,115 +371,56 @@ define <16 x i16> @testv16i16u(<16 x i16
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: testv16i16u:
@@ -606,335 +438,41 @@ define <32 x i8> @testv32i8(<32 x i8> %i
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %ecx
-; AVX1-NEXT: movl $15, %eax
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm1, %edx
-; AVX1-NEXT: bsrl %edx, %edx
-; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $7, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm0, %edx
-; AVX1-NEXT: bsrl %edx, %edx
-; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $7, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %ecx
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm1, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $7, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm0, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $7, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8:
@@ -974,269 +512,41 @@ define <32 x i8> @testv32i8u(<32 x i8> %
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8u:
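
The wider AVX2 cases in this file (testv16i16, testv8i32, testv4i64 above)
extend the same per-byte counts with one vpcmpeq* / vpsrl* / vpand / vpadd*
round per doubling of the element width: each round adds the high half's count
and keeps the low half's count only when the high half is entirely zero. A
scalar C++ sketch of that widening step follows; it is illustrative only, not
part of the patch, and the helper names ctlz8 and ctlzWiden exist only for the
sketch.

#include <cstdint>
#include <cstdio>

// Per-nibble leading zero counts (the PSHUFB table used throughout the checks).
static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                 0, 0, 0, 0, 0, 0, 0, 0};

// Per-byte count, as in the 128-bit checks: two nibble lookups combined.
static unsigned ctlz8(uint8_t X) {
  return LUT[X >> 4] + ((X >> 4) == 0 ? LUT[X & 0xF] : 0);
}

// One widening round per element-size doubling: the high half's count always
// contributes; the low half's count is kept (vpcmpeq* + vpand) only when the
// high half is entirely zero, and the two are then added (vpadd*).
static unsigned ctlzWiden(uint64_t X, unsigned Bits) {
  if (Bits == 8)
    return ctlz8(static_cast<uint8_t>(X));
  unsigned Half = Bits / 2;
  uint64_t Hi = X >> Half;
  uint64_t Lo = X & ((UINT64_C(1) << Half) - 1);
  return ctlzWiden(Hi, Half) + (Hi == 0 ? ctlzWiden(Lo, Half) : 0);
}

int main() {
  // Expected output: 40 8 1 (ctlz as i64, i32 and i16 respectively).
  std::printf("%u %u %u\n", ctlzWiden(UINT64_C(0x0000000000F00000), 64),
              ctlzWiden(0x00F00000u, 32), ctlzWiden(0x4000u, 16));
  return 0;
}

For example, ctlzWiden(0x0000000000F00000, 64) evaluates to 40, the same value
the v4i64 sequence produces for that element.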