[llvm] r326119 - [X86] Add a custom legalization for (i16 (bitcast v16i1)) and (i32 (bitcast v32i1)) without AVX512 to prevent scalarization

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 26 12:32:27 PST 2018


Author: ctopper
Date: Mon Feb 26 12:32:27 2018
New Revision: 326119

URL: http://llvm.org/viewvc/llvm-project?rev=326119&view=rev
Log:
[X86] Add a custom legalization for (i16 (bitcast v16i1)) and (i32 (bitcast v32i1)) without AVX512 to prevent scalarization

Summary:
We already have an early DAG combine that turns these patterns into MOVMSK, but that combine doesn't work when the vXi1 type has more elements than the widest legal vXi8 type. Type legalization eventually splits the vector down to v16i1 or v32i1, and the bitcast is then legalized to a truncstore plus a scalar reload. The truncstore in turn gets lowered to a long series of element extracts and bit math.

This patch adds a custom legalization that uses a sign extend and MOVMSK instead, preventing the eventual scalarization.
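
For illustration, the v64i8 test updated below exercises exactly this path. Its IR body is not reproduced in the diff, so the sketch here is reconstructed from the checked assembly rather than copied from the test file:

  define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
    ; The compare yields a <64 x i1> mask. Without AVX512BW the early MOVMSK
    ; combine can't fire (v64i8 is not legal), so type legalization splits the
    ; mask into v16i1/v32i1 pieces whose bitcasts hit the new custom lowering.
    %cmp = icmp sgt <64 x i8> %a, %b
    %res = bitcast <64 x i1> %cmp to i64
    ret i64 %res
  }

With this change each split piece becomes a sign extend plus PMOVMSKB (split in two for AVX1 when needed) instead of a long pextrb/shift/or chain, as the updated CHECK lines show.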

Reviewers: spatel, RKSimon, zvi

Reviewed By: RKSimon

Subscribers: mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D43593

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll
    llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
    llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=326119&r1=326118&r2=326119&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Feb 26 12:32:27 2018
@@ -883,6 +883,8 @@ X86TargetLowering::X86TargetLowering(con
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
+    if (!Subtarget.hasAVX512())
+      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
 
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
@@ -1012,6 +1014,9 @@ X86TargetLowering::X86TargetLowering(con
     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
 
+    if (!Subtarget.hasAVX512())
+      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
 
@@ -23740,6 +23745,24 @@ static SDValue LowerCMP_SWAP(SDValue Op,
   return SDValue();
 }
 
+// Create MOVMSKB, taking into account whether we need to split for AVX1.
+static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
+                           const X86Subtarget &Subtarget) {
+  MVT InVT = V.getSimpleValueType();
+
+  if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
+    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
+    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+                     DAG.getConstant(16, DL, MVT::i8));
+    return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+  }
+
+  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+}
+
 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
   SDValue Src = Op.getOperand(0);
@@ -23765,6 +23788,16 @@ static SDValue LowerBITCAST(SDValue Op,
   if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector())
     return Lower512IntUnary(Op, DAG);
 
+  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
+  if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
+    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
+    MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
+    SDLoc DL(Op);
+    SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
+    V = getPMOVMSKB(DL, V, DAG, Subtarget);
+    return DAG.getZExtOrTrunc(V, DL, DstVT);
+  }
+
   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
       SrcVT == MVT::i64) {
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
@@ -30648,17 +30681,8 @@ static SDValue combineBitcastvxi1(Select
   SDLoc DL(BitCast);
   SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
 
-  if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
-    // Handle pre-AVX2 cases by splitting to two v16i1's.
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
-    SDValue Lo = extract128BitVector(V, 0, DAG, DL);
-    SDValue Hi = extract128BitVector(V, 16, DAG, DL);
-    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
-    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
-    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
-                     DAG.getConstant(16, DL, ShiftTy));
-    V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+  if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
+    V = getPMOVMSKB(DL, V, DAG, Subtarget);
     return DAG.getZExtOrTrunc(V, DL, VT);
   }
 

Modified: llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll?rev=326119&r1=326118&r2=326119&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll Mon Feb 26 12:32:27 2018
@@ -490,265 +490,29 @@ define i16 @v16f32(<16 x float> %a, <16
 define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; SSE-LABEL: v64i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
-; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    pcmpgtb %xmm6, %xmm2
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; SSE-NEXT:    pcmpgtb %xmm7, %xmm3
-; SSE-NEXT:    pcmpgtb %xmm4, %xmm0
+; SSE-NEXT:    pcmpgtb %xmm6, %xmm2
 ; SSE-NEXT:    pcmpgtb %xmm5, %xmm1
-; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    pand %xmm2, %xmm8
+; SSE-NEXT:    pcmpgtb %xmm4, %xmm0
 ; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm9
 ; SSE-NEXT:    pand %xmm3, %xmm9
-; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm10
-; SSE-NEXT:    pand %xmm0, %xmm10
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT:    pand %xmm2, %xmm8
 ; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm11
 ; SSE-NEXT:    pand %xmm1, %xmm11
-; SSE-NEXT:    pextrb $1, %xmm11, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm11, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm11, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm11, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm11, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm11, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm11, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm11, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    pextrb $1, %xmm10, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm10, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm10, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm10, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm10, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm10, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm10, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm10, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    pextrb $1, %xmm9, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm9, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm9, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm9, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm9, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm9, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm9, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm9, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    pextrb $1, %xmm8, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
+; SSE-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT:    pand %xmm0, %xmm10
+; SSE-NEXT:    pmovmskb %xmm10, %eax
+; SSE-NEXT:    pmovmskb %xmm11, %ecx
+; SSE-NEXT:    shll $16, %ecx
 ; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm8, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm8, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm8, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm8, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm8, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm8, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm8, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    pmovmskb %xmm8, %edx
+; SSE-NEXT:    pmovmskb %xmm9, %eax
 ; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %edx
-; SSE-NEXT:    shll $16, %edx
-; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    orl %edx, %eax
 ; SSE-NEXT:    shlq $32, %rax
 ; SSE-NEXT:    orq %rcx, %rax
@@ -756,561 +520,51 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8
 ;
 ; AVX1-LABEL: v64i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
-; AVX1-NEXT:    .cfi_offset %rbp, -16
-; AVX1-NEXT:    movq %rsp, %rbp
-; AVX1-NEXT:    .cfi_def_cfa_register %rbp
-; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $64, %rsp
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
 ; AVX1-NEXT:    vpcmpgtb %xmm8, %xmm9, %xmm8
-; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm1, %ymm8
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm9
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm2
-; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpcmpgtb %xmm7, %xmm5, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vandps %ymm0, %ymm8, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm4, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
+; AVX1-NEXT:    vpand %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vpcmpgtb %xmm7, %xmm5, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm9, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
+; AVX1-NEXT:    vpcmpgtb %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    vpmovmskb %xmm1, %ecx
 ; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm1, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, (%rsp)
-; AVX1-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
 ; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, {{[0-9]+}}(%rsp)
-; AVX1-NEXT:    movl (%rsp), %ecx
-; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT:    vpmovmskb %xmm3, %edx
+; AVX1-NEXT:    vpmovmskb %xmm2, %eax
+; AVX1-NEXT:    shll $16, %eax
+; AVX1-NEXT:    orl %edx, %eax
 ; AVX1-NEXT:    shlq $32, %rax
 ; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    movq %rbp, %rsp
-; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v64i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:    .cfi_offset %rbp, -16
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $64, %rsp
 ; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpgtb %ymm7, %ymm5, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpgtb %ymm6, %ymm4, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, (%rsp)
-; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl (%rsp), %ecx
-; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb %ymm7, %ymm5, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtb %ymm6, %ymm4, %ymm2
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    vpmovmskb %ymm1, %eax
 ; AVX2-NEXT:    shlq $32, %rax
 ; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    movq %rbp, %rsp
-; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll?rev=326119&r1=326118&r2=326119&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-128.ll Mon Feb 26 12:32:27 2018
@@ -646,155 +646,42 @@ define i64 @v16i8_widened_with_zeroes(<1
 ; SSE2-SSSE3-LABEL: v16i8_widened_with_zeroes:
 ; SSE2-SSSE3:       # %bb.0: # %entry
 ; SSE2-SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT:    andl $1, %eax
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    shll $4, %ecx
-; SSE2-SSSE3-NEXT:    orl %eax, %ecx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT:    andl $1, %eax
-; SSE2-SSSE3-NEXT:    shll $5, %eax
-; SSE2-SSSE3-NEXT:    orl %ecx, %eax
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    shll $6, %ecx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-SSSE3-NEXT:    andl $1, %edx
-; SSE2-SSSE3-NEXT:    shll $7, %edx
-; SSE2-SSSE3-NEXT:    orl %ecx, %edx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    shll $8, %ecx
-; SSE2-SSSE3-NEXT:    orl %edx, %ecx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-SSSE3-NEXT:    andl $1, %edx
-; SSE2-SSSE3-NEXT:    shll $9, %edx
-; SSE2-SSSE3-NEXT:    orl %ecx, %edx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    shll $10, %ecx
-; SSE2-SSSE3-NEXT:    orl %edx, %ecx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-SSSE3-NEXT:    andl $1, %edx
-; SSE2-SSSE3-NEXT:    shll $11, %edx
-; SSE2-SSSE3-NEXT:    orl %ecx, %edx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    shll $12, %ecx
-; SSE2-SSSE3-NEXT:    orl %edx, %ecx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-SSSE3-NEXT:    andl $1, %edx
-; SSE2-SSSE3-NEXT:    shll $13, %edx
-; SSE2-SSSE3-NEXT:    orl %ecx, %edx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    andl $1, %ecx
-; SSE2-SSSE3-NEXT:    shll $14, %ecx
-; SSE2-SSSE3-NEXT:    orl %edx, %ecx
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-SSSE3-NEXT:    shll $15, %edx
-; SSE2-SSSE3-NEXT:    orl %ecx, %edx
-; SSE2-SSSE3-NEXT:    orl %eax, %edx
-; SSE2-SSSE3-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT:    movzwl -{{[0-9]+}}(%rsp), %edx
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %ecx
+; SSE2-SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %edx
 ; SSE2-SSSE3-NEXT:    movl %edx, %eax
 ; SSE2-SSSE3-NEXT:    shll $16, %eax
-; SSE2-SSSE3-NEXT:    orl %eax, %edx
-; SSE2-SSSE3-NEXT:    shlq $32, %rdx
-; SSE2-SSSE3-NEXT:    orl %ecx, %eax
-; SSE2-SSSE3-NEXT:    orq %rdx, %rax
+; SSE2-SSSE3-NEXT:    orl %eax, %ecx
+; SSE2-SSSE3-NEXT:    orl %edx, %eax
+; SSE2-SSSE3-NEXT:    shlq $32, %rax
+; SSE2-SSSE3-NEXT:    orq %rcx, %rax
 ; SSE2-SSSE3-NEXT:    retq
 ;
-; AVX12-LABEL: v16i8_widened_with_zeroes:
-; AVX12:       # %bb.0: # %entry
-; AVX12-NEXT:    pushq %rbp
-; AVX12-NEXT:    .cfi_def_cfa_offset 16
-; AVX12-NEXT:    .cfi_offset %rbp, -16
-; AVX12-NEXT:    movq %rsp, %rbp
-; AVX12-NEXT:    .cfi_def_cfa_register %rbp
-; AVX12-NEXT:    andq $-32, %rsp
-; AVX12-NEXT:    subq $64, %rsp
-; AVX12-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX12-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX12-NEXT:    andl $1, %eax
-; AVX12-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX12-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX12-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX12-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    shll $4, %ecx
-; AVX12-NEXT:    orl %eax, %ecx
-; AVX12-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX12-NEXT:    andl $1, %eax
-; AVX12-NEXT:    shll $5, %eax
-; AVX12-NEXT:    orl %ecx, %eax
-; AVX12-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    shll $6, %ecx
-; AVX12-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX12-NEXT:    andl $1, %edx
-; AVX12-NEXT:    shll $7, %edx
-; AVX12-NEXT:    orl %ecx, %edx
-; AVX12-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    shll $8, %ecx
-; AVX12-NEXT:    orl %edx, %ecx
-; AVX12-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX12-NEXT:    andl $1, %edx
-; AVX12-NEXT:    shll $9, %edx
-; AVX12-NEXT:    orl %ecx, %edx
-; AVX12-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    shll $10, %ecx
-; AVX12-NEXT:    orl %edx, %ecx
-; AVX12-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX12-NEXT:    andl $1, %edx
-; AVX12-NEXT:    shll $11, %edx
-; AVX12-NEXT:    orl %ecx, %edx
-; AVX12-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    shll $12, %ecx
-; AVX12-NEXT:    orl %edx, %ecx
-; AVX12-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX12-NEXT:    andl $1, %edx
-; AVX12-NEXT:    shll $13, %edx
-; AVX12-NEXT:    orl %ecx, %edx
-; AVX12-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX12-NEXT:    andl $1, %ecx
-; AVX12-NEXT:    shll $14, %ecx
-; AVX12-NEXT:    orl %edx, %ecx
-; AVX12-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX12-NEXT:    andl $1, %edx
-; AVX12-NEXT:    shll $15, %edx
-; AVX12-NEXT:    orl %ecx, %edx
-; AVX12-NEXT:    orl %eax, %edx
-; AVX12-NEXT:    movl %edx, (%rsp)
-; AVX12-NEXT:    movl $0, {{[0-9]+}}(%rsp)
-; AVX12-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; AVX12-NEXT:    shlq $32, %rcx
-; AVX12-NEXT:    movl (%rsp), %eax
-; AVX12-NEXT:    orq %rcx, %rax
-; AVX12-NEXT:    movq %rbp, %rsp
-; AVX12-NEXT:    popq %rbp
-; AVX12-NEXT:    retq
+; AVX1-LABEL: v16i8_widened_with_zeroes:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
+; AVX1-NEXT:    movl %edx, %eax
+; AVX1-NEXT:    shll $16, %eax
+; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    orl %edx, %eax
+; AVX1-NEXT:    shlq $32, %rax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i8_widened_with_zeroes:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovmskb %ymm1, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    vmovdqa %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    orq %rcx, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: v16i8_widened_with_zeroes:
 ; AVX512F:       # %bb.0: # %entry

Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll?rev=326119&r1=326118&r2=326119&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll Mon Feb 26 12:32:27 2018
@@ -204,253 +204,17 @@ define i16 @v16f32(<16 x float> %a, <16
 define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; SSE-LABEL: v64i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pcmpgtb %xmm5, %xmm1
-; SSE-NEXT:    pextrb $1, %xmm1, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm1, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm1, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm1, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm1, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm1, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm1, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm1, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    pcmpgtb %xmm4, %xmm0
-; SSE-NEXT:    pextrb $1, %xmm0, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm0, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm0, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm0, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm0, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm0, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm0, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm0, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    pcmpgtb %xmm7, %xmm3
-; SSE-NEXT:    pextrb $1, %xmm3, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    pcmpgtb %xmm5, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %ecx
+; SSE-NEXT:    shll $16, %ecx
 ; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm3, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm3, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm3, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm3, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm3, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm3, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm3, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    pcmpgtb %xmm6, %xmm2
-; SSE-NEXT:    pextrb $1, %xmm2, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    pextrb $0, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE-NEXT:    pextrb $2, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE-NEXT:    pextrb $3, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE-NEXT:    pextrb $4, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $4, %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    pextrb $5, %xmm2, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $5, %eax
-; SSE-NEXT:    orl %ecx, %eax
-; SSE-NEXT:    pextrb $6, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $6, %ecx
-; SSE-NEXT:    pextrb $7, %xmm2, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $7, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $8, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $9, %xmm2, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $9, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $10, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $10, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $11, %xmm2, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $11, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $12, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $12, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $13, %xmm2, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    shll $13, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    pextrb $14, %xmm2, %ecx
-; SSE-NEXT:    andl $1, %ecx
-; SSE-NEXT:    shll $14, %ecx
-; SSE-NEXT:    orl %edx, %ecx
-; SSE-NEXT:    pextrb $15, %xmm2, %edx
-; SSE-NEXT:    shll $15, %edx
-; SSE-NEXT:    orl %ecx, %edx
-; SSE-NEXT:    orl %eax, %edx
-; SSE-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    pmovmskb %xmm2, %edx
+; SSE-NEXT:    pcmpgtb %xmm7, %xmm3
+; SSE-NEXT:    pmovmskb %xmm3, %eax
 ; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT:    orl %eax, %ecx
-; SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %edx
-; SSE-NEXT:    shll $16, %edx
-; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    orl %edx, %eax
 ; SSE-NEXT:    shlq $32, %rax
 ; SSE-NEXT:    orq %rcx, %rax
@@ -458,541 +222,35 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8
 ;
 ; AVX1-LABEL: v64i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
-; AVX1-NEXT:    .cfi_offset %rbp, -16
-; AVX1-NEXT:    movq %rsp, %rbp
-; AVX1-NEXT:    .cfi_def_cfa_register %rbp
-; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $64, %rsp
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm4
-; AVX1-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm4, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm4, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
+; AVX1-NEXT:    vpmovmskb %xmm4, %eax
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
 ; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, (%rsp)
-; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm0
-; AVX1-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
 ; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, {{[0-9]+}}(%rsp)
-; AVX1-NEXT:    movl (%rsp), %ecx
-; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    shll $16, %eax
+; AVX1-NEXT:    orl %edx, %eax
 ; AVX1-NEXT:    shlq $32, %rax
 ; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    movq %rbp, %rsp
-; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v64i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:    .cfi_offset %rbp, -16
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $64, %rsp
 ; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, (%rsp)
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
 ; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm0
-; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl (%rsp), %ecx
-; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
 ; AVX2-NEXT:    shlq $32, %rax
 ; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    movq %rbp, %rsp
-; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
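For readers skimming the regenerated checks above: the long vpextrb/andl/shll/orl chains (and the stack round-trip through %rbp) are gone, replaced by vpmovmskb-based packing of the compare masks. The following sketch is not part of this commit; it is a hypothetical C++ intrinsics equivalent (function names invented here) whose natural lowering is essentially the instruction sequences the new AVX1 and AVX2 CHECK lines expect. Compile the first function with -mavx and the second with -mavx2.

// Illustrative only: mirrors the pattern the updated CHECK lines test for.
#include <immintrin.h>
#include <cstdint>

// AVX1 path: there is no 256-bit integer compare, so split into 128-bit
// halves, take pmovmskb of each half, and merge the high mask with a
// 16-bit shift -- the vpmovmskb + shll $16 + orl sequence in the checks.
static uint32_t cmpgt_mask32_avx1(__m256i a, __m256i b) {
  __m128i alo = _mm256_castsi256_si128(a);
  __m128i ahi = _mm256_extractf128_si256(a, 1);
  __m128i blo = _mm256_castsi256_si128(b);
  __m128i bhi = _mm256_extractf128_si256(b, 1);
  uint32_t lo = (uint32_t)_mm_movemask_epi8(_mm_cmpgt_epi8(alo, blo));
  uint32_t hi = (uint32_t)_mm_movemask_epi8(_mm_cmpgt_epi8(ahi, bhi));
  return lo | (hi << 16);
}

// AVX2 path: one 256-bit vpcmpgtb and a single vpmovmskb per 32-byte half;
// the two 32-bit masks are then glued into a 64-bit result, matching the
// shlq $32 + orq tail of the v64i8 checks.
static uint64_t cmpgt_mask64_avx2(__m256i a0, __m256i a1,
                                  __m256i b0, __m256i b1) {
  uint64_t lo = (uint32_t)_mm256_movemask_epi8(_mm256_cmpgt_epi8(a0, b0));
  uint64_t hi = (uint32_t)_mm256_movemask_epi8(_mm256_cmpgt_epi8(a1, b1));
  return lo | (hi << 32);
}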



