[llvm] r320120 - [X86] Handle all versions of vXi1 insert_vector_elt with a constant index without falling back to shuffles.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 7 16:16:09 PST 2017


Author: ctopper
Date: Thu Dec  7 16:16:09 2017
New Revision: 320120

URL: http://llvm.org/viewvc/llvm-project?rev=320120&view=rev
Log:
[X86] Handle all versions of vXi1 insert_vector_elt with a constant index without falling back to shuffles.

We previously only supported inserting into the LSB or MSB positions, where it was easy to zero the destination bit and perform an OR to insert the new value.

This change effectively extracts the old value, xors it with the new value, and then xors that single bit back into the correct position of the original vector. The second xor cancels out the old value that is still present, leaving the new value in that position (old ^ (old ^ new) = new); every other bit is xored with 0 and is unchanged.

As implemented, this takes three shifts and two xors and needs an additional register. We could avoid the extra register at the cost of another shift.
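
For illustration, here is a minimal scalar sketch of the emitted sequence, using a plain uint16_t as a stand-in for a 16-bit k-register (the helper name and test values below are made up for this example, not LLVM code):

  #include <cassert>
  #include <cstdint>

  // Insert the single bit NewBit (0 or 1) at position Idx of Mask, mirroring
  // the kshiftr/kxor/kshiftl/kshiftr/kxor sequence this change emits.
  static uint16_t insertBit(uint16_t Mask, unsigned Idx, uint16_t NewBit) {
    const unsigned NumElems = 16;
    uint16_t Merged = Mask >> Idx;     // old bit now at bit 0            (kshiftr)
    Merged ^= NewBit;                  // old ^ new at bit 0              (kxor)
    Merged <<= (NumElems - 1);         // move to MSB, zero lower bits    (kshiftl)
    Merged >>= (NumElems - 1 - Idx);   // move to Idx, zero upper bits    (kshiftr)
    return Merged ^ Mask;              // old bit cancels, new bit stays  (kxor)
  }

  int main() {
    assert(insertBit(0xFFFF, 5, 0) == 0xFFDF);  // clear bit 5
    assert(insertBit(0x0000, 10, 1) == 0x0400); // set bit 10
    assert(insertBit(0x0400, 10, 1) == 0x0400); // re-inserting the same value is a no-op
    return 0;
  }

The narrowing stores back into uint16_t play the role of the kshift instructions zeroing out the bits shifted past the mask width.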

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Dec  7 16:16:09 2017
@@ -14699,21 +14699,14 @@ static SDValue InsertBitToMaskVector(SDV
   // If the kshift instructions of the correct width aren't natively supported
   // then we need to promote the vector to the native size to get the correct
   // zeroing behavior.
-  bool HasNativeShift = true;
   if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
-    HasNativeShift = false;
-    // For now don't do this if we are going to end up using the shuffle
-    // below. This minimizes test diffs.
-    // TODO: Remove this restriction once we no longer need a shuffle fallback.
-    if (Vec.isUndef() || IdxVal == 0) {
-      // Need to promote to v16i1, do the insert, then extract back.
-      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
-                        DAG.getUNDEF(MVT::v16i1), Vec,
-                        DAG.getIntPtrConstant(0, dl));
-      Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
-      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
-                         DAG.getIntPtrConstant(0, dl));
-    }
+    // Need to promote to v16i1, do the insert, then extract back.
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+                      DAG.getUNDEF(MVT::v16i1), Vec,
+                      DAG.getIntPtrConstant(0, dl));
+    Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
+                       DAG.getIntPtrConstant(0, dl));
   }
 
   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
@@ -14741,7 +14734,7 @@ static SDValue InsertBitToMaskVector(SDV
     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
   }
   // Insertion of one bit into last position
-  if (HasNativeShift && IdxVal == NumElems - 1) {
+  if (IdxVal == NumElems - 1) {
     // Move the bit to the last position inside the vector.
     EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                            DAG.getConstant(IdxVal, dl, MVT::i8));
@@ -14754,12 +14747,20 @@ static SDValue InsertBitToMaskVector(SDV
     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
   }
 
-  // Use shuffle to insert element.
-  SmallVector<int, 64> MaskVec(NumElems);
-  for (unsigned i = 0; i != NumElems; ++i)
-    MaskVec[i] = (i == IdxVal) ? NumElems : i;
-
-  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
+  // Move the current value of the bit to be replaced to bit 0.
+  SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
+                               DAG.getConstant(IdxVal, dl, MVT::i8));
+  // Xor with the new bit.
+  Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
+  // Shift to MSB, filling bottom bits with 0.
+  Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
+                       DAG.getConstant(NumElems - 1, dl, MVT::i8));
+  // Shift to the final position, filling upper bits with 0.
+  Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
+                       DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
+  // Xor with original vector to cancel out the original bit value that's still
+  // present.
+  return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
 }
 
 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Thu Dec  7 16:16:09 2017
@@ -309,31 +309,28 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; KNL-LABEL: test16:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movb (%rdi), %al
-; KNL-NEXT:    kmovw %esi, %k1
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; KNL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; KNL-NEXT:    vpslld $31, %zmm2, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %esi, %k0
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftrw $10, %k0, %k2
+; KNL-NEXT:    kxorw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $5, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def %ax killed %ax killed %eax
-; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test16:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    vpmovm2d %k0, %zmm0
-; SKX-NEXT:    vpmovm2d %k1, %zmm1
-; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; SKX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; SKX-NEXT:    vpmovd2m %zmm2, %k0
+; SKX-NEXT:    kshiftrw $10, %k1, %k2
+; SKX-NEXT:    kxorw %k0, %k2, %k0
+; SKX-NEXT:    kshiftlw $15, %k0, %k0
+; SKX-NEXT:    kshiftrw $5, %k0, %k0
+; SKX-NEXT:    kxorw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def %ax killed %ax killed %eax
-; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = load i1 , i1 * %addr, align 128
   %a1 = bitcast i16 %a to <16 x i1>
@@ -346,31 +343,28 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movb (%rdi), %al
-; KNL-NEXT:    kmovw %esi, %k1
-; KNL-NEXT:    kmovw %eax, %k2
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; KNL-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %esi, %k0
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftrw $4, %k0, %k2
+; KNL-NEXT:    kxorw %k1, %k2, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $11, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def %al killed %al killed %eax
-; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test17:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    kmovd %esi, %k1
-; SKX-NEXT:    vpmovm2q %k0, %zmm0
-; SKX-NEXT:    vpmovm2q %k1, %zmm1
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; SKX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; SKX-NEXT:    vpmovq2m %zmm2, %k0
+; SKX-NEXT:    kshiftrb $4, %k1, %k2
+; SKX-NEXT:    kxorb %k0, %k2, %k0
+; SKX-NEXT:    kshiftlb $7, %k0, %k0
+; SKX-NEXT:    kshiftrb $3, %k0, %k0
+; SKX-NEXT:    kxorb %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def %al killed %al killed %eax
-; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = load i1 , i1 * %addr, align 128
   %a1 = bitcast i8 %a to <8 x i1>
@@ -962,12 +956,12 @@ define i32 @test_insertelement_v32i1(i32
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckwd %k0, %k1, %k0
-; SKX-NEXT:    vpmovm2w %k0, %zmm0
-; SKX-NEXT:    kmovd %eax, %k0
-; SKX-NEXT:    vpmovm2w %k0, %zmm1
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; SKX-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; SKX-NEXT:    vpmovw2m %zmm2, %k0
+; SKX-NEXT:    kshiftrd $4, %k0, %k1
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kxord %k2, %k1, %k1
+; SKX-NEXT:    kshiftld $31, %k1, %k1
+; SKX-NEXT:    kshiftrd $27, %k1, %k1
+; SKX-NEXT:    kxord %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -988,37 +982,33 @@ define i8 @test_iinsertelement_v4i1(i32
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    vpextrb $4, %xmm0, %ecx
-; KNL-NEXT:    kmovw %ecx, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT:    kmovw %ecx, %k0
 ; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
 ; KNL-NEXT:    andl $1, %ecx
-; KNL-NEXT:    kmovw %ecx, %k0
-; KNL-NEXT:    kshiftrw $1, %k0, %k1
-; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    korw %k0, %k1, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; KNL-NEXT:    vpsllq $63, %zmm3, %zmm1
-; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; KNL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; KNL-NEXT:    vpsllq $63, %zmm3, %zmm1
-; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kshiftrw $1, %k0, %k2
+; KNL-NEXT:    kshiftlw $1, %k2, %k2
+; KNL-NEXT:    korw %k1, %k2, %k1
+; KNL-NEXT:    kshiftrw $1, %k1, %k2
+; KNL-NEXT:    kxorw %k0, %k2, %k0
+; KNL-NEXT:    kshiftlw $15, %k0, %k0
+; KNL-NEXT:    kshiftrw $14, %k0, %k0
+; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kshiftrw $2, %k0, %k1
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $13, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k0
+; KNL-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL-NEXT:    vpextrb $12, %xmm0, %eax
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; KNL-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $12, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def %al killed %al killed %eax
-; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_iinsertelement_v4i1:
@@ -1026,12 +1016,12 @@ define i8 @test_iinsertelement_v4i1(i32
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpmovm2d %k0, %xmm0
-; SKX-NEXT:    kmovd %eax, %k0
-; SKX-NEXT:    vpmovm2d %k0, %xmm1
-; SKX-NEXT:    vpbroadcastq %xmm1, %xmm1
-; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; SKX-NEXT:    vpmovd2m %xmm0, %k0
+; SKX-NEXT:    kshiftrw $2, %k0, %k1
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kxorw %k2, %k1, %k1
+; SKX-NEXT:    kshiftlw $15, %k1, %k1
+; SKX-NEXT:    kshiftrw $13, %k1, %k1
+; SKX-NEXT:    kxorw %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def %al killed %al killed %eax
 ; SKX-NEXT:    retq
@@ -1057,17 +1047,15 @@ define i8 @test_iinsertelement_v2i1(i32
 ; KNL-NEXT:    kmovw %ecx, %k0
 ; KNL-NEXT:    kshiftrw $1, %k0, %k1
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
-; KNL-NEXT:    korw %k0, %k1, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    kshiftrw $1, %k0, %k1
+; KNL-NEXT:    kmovw %eax, %k2
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $14, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def %al killed %al killed %eax
-; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_iinsertelement_v2i1:
@@ -1075,11 +1063,12 @@ define i8 @test_iinsertelement_v2i1(i32
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpmovm2q %k0, %xmm0
-; SKX-NEXT:    kmovd %eax, %k0
-; SKX-NEXT:    vpmovm2q %k0, %xmm1
-; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SKX-NEXT:    vpmovq2m %xmm0, %k0
+; SKX-NEXT:    kshiftrw $1, %k0, %k1
+; SKX-NEXT:    kmovd %eax, %k2
+; SKX-NEXT:    kxorw %k2, %k1, %k1
+; SKX-NEXT:    kshiftlw $15, %k1, %k1
+; SKX-NEXT:    kshiftrw $14, %k1, %k1
+; SKX-NEXT:    kxorw %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def %al killed %al killed %eax
 ; SKX-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Thu Dec  7 16:16:09 2017
@@ -972,14 +972,11 @@ define <64 x i8> @test16(i64 %x) {
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    movb $1, %al
 ; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vpmovm2b %k1, %zmm0
-; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0
-; SKX-NEXT:    vpmovm2b %k0, %zmm1
-; SKX-NEXT:    movl $32, %eax
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    kshiftrq $5, %k0, %k2
+; SKX-NEXT:    kxorq %k1, %k2, %k1
+; SKX-NEXT:    kshiftlq $63, %k1, %k1
+; SKX-NEXT:    kshiftrq $58, %k1, %k1
+; SKX-NEXT:    kxorq %k0, %k1, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
 ;
@@ -988,13 +985,11 @@ define <64 x i8> @test16(i64 %x) {
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
 ; AVX512BW-NEXT:    movb $1, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512BW-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k2
+; AVX512BW-NEXT:    kxorq %k1, %k2, %k1
+; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
+; AVX512BW-NEXT:    kxorq %k0, %k1, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1085,14 +1080,11 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; SKX-NEXT:    cmpl %edx, %esi
 ; SKX-NEXT:    setg %al
 ; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vpmovm2b %k1, %zmm0
-; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0
-; SKX-NEXT:    vpmovm2b %k0, %zmm1
-; SKX-NEXT:    movl $32, %eax
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    kshiftrq $5, %k0, %k2
+; SKX-NEXT:    kxorq %k1, %k2, %k1
+; SKX-NEXT:    kshiftlq $63, %k1, %k1
+; SKX-NEXT:    kshiftrq $58, %k1, %k1
+; SKX-NEXT:    kxorq %k0, %k1, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
 ;
@@ -1102,13 +1094,11 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 ; AVX512BW-NEXT:    cmpl %edx, %esi
 ; AVX512BW-NEXT:    setg %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512BW-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k2
+; AVX512BW-NEXT:    kxorq %k1, %k2, %k1
+; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
+; AVX512BW-NEXT:    kxorq %k0, %k1, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1159,24 +1149,22 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 define <8 x i1> @test18(i8 %a, i16 %y) {
 ; KNL-LABEL: test18:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    kshiftlw $7, %k0, %k1
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kshiftlw $7, %k1, %k2
+; KNL-NEXT:    kshiftrw $15, %k2, %k2
+; KNL-NEXT:    kshiftlw $6, %k1, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kshiftlw $6, %k0, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k3
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; KNL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k2
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,8]
-; KNL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT:    kshiftrw $6, %k0, %k3
+; KNL-NEXT:    kxorw %k1, %k3, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $9, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k0
+; KNL-NEXT:    kshiftrw $7, %k0, %k1
+; KNL-NEXT:    kxorw %k2, %k1, %k1
+; KNL-NEXT:    kshiftlw $15, %k1, %k1
+; KNL-NEXT:    kshiftrw $8, %k1, %k1
+; KNL-NEXT:    kxorw %k0, %k1, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %ymm0
@@ -1185,45 +1173,42 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ;
 ; SKX-LABEL: test18:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    kmovd %esi, %k2
-; SKX-NEXT:    kshiftlw $7, %k2, %k0
-; SKX-NEXT:    kshiftrw $15, %k0, %k0
-; SKX-NEXT:    kshiftlw $6, %k2, %k2
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
+; SKX-NEXT:    kshiftlw $7, %k1, %k2
 ; SKX-NEXT:    kshiftrw $15, %k2, %k2
-; SKX-NEXT:    vpmovm2q %k1, %zmm0
-; SKX-NEXT:    vpmovm2q %k2, %zmm1
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; SKX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; SKX-NEXT:    vpmovq2m %zmm2, %k1
-; SKX-NEXT:    kshiftlb $1, %k1, %k1
+; SKX-NEXT:    kshiftlw $6, %k1, %k1
+; SKX-NEXT:    kshiftrw $15, %k1, %k1
+; SKX-NEXT:    kshiftrb $6, %k0, %k3
+; SKX-NEXT:    kxorb %k1, %k3, %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $1, %k1, %k1
-; SKX-NEXT:    kshiftlb $7, %k0, %k0
-; SKX-NEXT:    korb %k0, %k1, %k0
+; SKX-NEXT:    kxorb %k0, %k1, %k0
+; SKX-NEXT:    kshiftlb $1, %k0, %k0
+; SKX-NEXT:    kshiftrb $1, %k0, %k0
+; SKX-NEXT:    kshiftlb $7, %k2, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
-; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test18:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k2
-; AVX512BW-NEXT:    kmovd %esi, %k0
-; AVX512BW-NEXT:    kshiftlw $7, %k0, %k1
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kshiftlw $7, %k1, %k2
+; AVX512BW-NEXT:    kshiftrw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $6, %k1, %k1
 ; AVX512BW-NEXT:    kshiftrw $15, %k1, %k1
-; AVX512BW-NEXT:    kshiftlw $6, %k0, %k0
-; AVX512BW-NEXT:    kshiftrw $15, %k0, %k3
-; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512BW-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k2
-; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512BW-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,8]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k3
+; AVX512BW-NEXT:    kxorw %k1, %k3, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT:    kxorw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512BW-NEXT:    kxorw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    kxorw %k0, %k1, %k0
 ; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1231,21 +1216,21 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ;
 ; AVX512DQ-LABEL: test18:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %edi, %k1
-; AVX512DQ-NEXT:    kmovw %esi, %k2
-; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k0
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kshiftlw $7, %k1, %k2
 ; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k2
-; AVX512DQ-NEXT:    vpmovm2q %k1, %zmm0
-; AVX512DQ-NEXT:    vpmovm2q %k2, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vpmovq2m %zmm2, %k1
-; AVX512DQ-NEXT:    kshiftlb $1, %k1, %k1
+; AVX512DQ-NEXT:    kshiftlw $6, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrb $6, %k0, %k3
+; AVX512DQ-NEXT:    kxorb %k1, %k3, %k1
+; AVX512DQ-NEXT:    kshiftlb $7, %k1, %k1
 ; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k1
-; AVX512DQ-NEXT:    kshiftlb $7, %k0, %k0
-; AVX512DQ-NEXT:    korb %k0, %k1, %k0
+; AVX512DQ-NEXT:    kxorb %k0, %k1, %k0
+; AVX512DQ-NEXT:    kshiftlb $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k2, %k1
+; AVX512DQ-NEXT:    korb %k1, %k0, %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512DQ-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %ymm0

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Thu Dec  7 16:16:09 2017
@@ -7325,14 +7325,11 @@ define <64 x i8> @vmov_test16(i64 %x) {
 ; GENERIC-NEXT:    kmovq %rdi, %k0 # sched: [1:0.33]
 ; GENERIC-NEXT:    movb $1, %al # sched: [1:0.33]
 ; GENERIC-NEXT:    kmovd %eax, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovm2b %k1, %zmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpmovm2b %k0, %zmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    movl $32, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    kmovd %eax, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [2:1.00]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vpmovb2m %zmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT:    kshiftrq $5, %k0, %k2 # sched: [1:1.00]
+; GENERIC-NEXT:    kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftlq $63, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftrq $58, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kxorq %k0, %k1, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovm2b %k0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7341,14 +7338,11 @@ define <64 x i8> @vmov_test16(i64 %x) {
 ; SKX-NEXT:    kmovq %rdi, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    movb $1, %al # sched: [1:0.25]
 ; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vpmovm2b %k1, %zmm0 # sched: [1:0.25]
-; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpmovm2b %k0, %zmm1 # sched: [1:0.25]
-; SKX-NEXT:    movl $32, %eax # sched: [1:0.25]
-; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vpmovb2m %zmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kshiftrq $5, %k0, %k2 # sched: [3:1.00]
+; SKX-NEXT:    kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; SKX-NEXT:    kshiftlq $63, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kshiftrq $58, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kxorq %k0, %k1, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = bitcast i64 %x to <64 x i1>
@@ -7365,14 +7359,11 @@ define <64 x i8> @vmov_test17(i64 %x, i3
 ; GENERIC-NEXT:    cmpl %edx, %esi # sched: [1:0.33]
 ; GENERIC-NEXT:    setg %al # sched: [1:0.50]
 ; GENERIC-NEXT:    kmovd %eax, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovm2b %k1, %zmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vpmovm2b %k0, %zmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    movl $32, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    kmovd %eax, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [2:1.00]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vpmovb2m %zmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT:    kshiftrq $5, %k0, %k2 # sched: [1:1.00]
+; GENERIC-NEXT:    kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftlq $63, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftrq $58, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kxorq %k0, %k1, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovm2b %k0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -7382,14 +7373,11 @@ define <64 x i8> @vmov_test17(i64 %x, i3
 ; SKX-NEXT:    cmpl %edx, %esi # sched: [1:0.25]
 ; SKX-NEXT:    setg %al # sched: [1:0.50]
 ; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vpmovm2b %k1, %zmm0 # sched: [1:0.25]
-; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpmovm2b %k0, %zmm1 # sched: [1:0.25]
-; SKX-NEXT:    movl $32, %eax # sched: [1:0.25]
-; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vpmovb2m %zmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kshiftrq $5, %k0, %k2 # sched: [3:1.00]
+; SKX-NEXT:    kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; SKX-NEXT:    kshiftlq $63, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kshiftrq $58, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kxorq %k0, %k1, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = bitcast i64 %x to <64 x i1>
@@ -7402,44 +7390,42 @@ define <64 x i8> @vmov_test17(i64 %x, i3
 define <8 x i1> @vmov_test18(i8 %a, i16 %y) {
 ; GENERIC-LABEL: vmov_test18:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovd %edi, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    kmovd %esi, %k2 # sched: [1:0.33]
-; GENERIC-NEXT:    kshiftlw $7, %k2, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    kshiftrw $15, %k0, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    kshiftlw $6, %k2, %k2 # sched: [1:1.00]
+; GENERIC-NEXT:    kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT:    kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT:    kshiftlw $7, %k1, %k2 # sched: [1:1.00]
 ; GENERIC-NEXT:    kshiftrw $15, %k2, %k2 # sched: [1:1.00]
-; GENERIC-NEXT:    vpmovm2q %k1, %zmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vpmovm2q %k2, %zmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 # sched: [1:1.00]
-; GENERIC-NEXT:    vpmovq2m %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    kshiftlb $1, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftlw $6, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftrw $15, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftrb $6, %k0, %k3 # sched: [1:1.00]
+; GENERIC-NEXT:    kxorb %k1, %k3, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftlb $7, %k1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    kshiftrb $1, %k1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT:    kshiftlb $7, %k0, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    korb %k0, %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kxorb %k0, %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftlb $1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftrb $1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kshiftlb $7, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT:    korb %k1, %k0, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpmovm2w %k0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vzeroupper
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vmov_test18:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    kmovd %edi, %k1 # sched: [1:1.00]
-; SKX-NEXT:    kmovd %esi, %k2 # sched: [1:1.00]
-; SKX-NEXT:    kshiftlw $7, %k2, %k0 # sched: [3:1.00]
-; SKX-NEXT:    kshiftrw $15, %k0, %k0 # sched: [3:1.00]
-; SKX-NEXT:    kshiftlw $6, %k2, %k2 # sched: [3:1.00]
+; SKX-NEXT:    kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT:    kshiftlw $7, %k1, %k2 # sched: [3:1.00]
 ; SKX-NEXT:    kshiftrw $15, %k2, %k2 # sched: [3:1.00]
-; SKX-NEXT:    vpmovm2q %k1, %zmm0 # sched: [1:0.25]
-; SKX-NEXT:    vpmovm2q %k2, %zmm1 # sched: [1:0.25]
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [8:0.50]
-; SKX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 # sched: [3:1.00]
-; SKX-NEXT:    vpmovq2m %zmm2, %k1 # sched: [1:1.00]
-; SKX-NEXT:    kshiftlb $1, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kshiftlw $6, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kshiftrw $15, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    kshiftrb $6, %k0, %k3 # sched: [3:1.00]
+; SKX-NEXT:    kxorb %k1, %k3, %k1 # sched: [1:1.00]
+; SKX-NEXT:    kshiftlb $7, %k1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    kshiftrb $1, %k1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    kshiftlb $7, %k0, %k0 # sched: [3:1.00]
-; SKX-NEXT:    korb %k0, %k1, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kxorb %k0, %k1, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kshiftlb $1, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    kshiftrb $1, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    kshiftlb $7, %k2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    korb %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0 # sched: [1:0.25]
-; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %b = bitcast i8 %a to <8 x i1>
   %b1 = bitcast i16 %y to <16 x i1>

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll Thu Dec  7 16:16:09 2017
@@ -120,713 +120,537 @@ define <8 x i64> @test_mm512_mask_set1_e
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset %esi, -12
 ; X32-NEXT:    .cfi_offset %ebx, -8
-; X32-NEXT:    vmovdqa64 %zmm0, %zmm3
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpsllw $8, %xmm1, %xmm1
-; X32-NEXT:    kmovd %eax, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kshiftrq $1, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $62, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $2, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpslld $24, %xmm2, %xmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $5, %cl
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpsllq $56, %xmm1, %xmm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastq %xmm1, %xmm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $61, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $3, %k0, %k1
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $60, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $4, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $4, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $59, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $5, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $5, %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $58, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $6, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $6, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $57, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $7, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $7, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $56, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $8, %k0, %k1
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $55, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $9, %k0, %k1
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $54, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $10, %k0, %k1
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $13, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpsllw $8, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $53, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $11, %k0, %k1
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $12, %eax
+; X32-NEXT:    andl $15, %eax
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $13, %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k3
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $14, %eax
+; X32-NEXT:    andl $3, %eax
+; X32-NEXT:    kmovd %eax, %k4
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $15, %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    kmovd %eax, %k5
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $16, %edx
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k6
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $15, %bl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    shrb $2, %al
+; X32-NEXT:    kmovd %eax, %k7
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $52, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $12, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $51, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $13, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $50, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $14, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $49, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $15, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $48, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $16, %k0, %k1
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $47, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $17, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $46, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $18, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $45, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $19, %k0, %k1
+; X32-NEXT:    shrb $3, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $44, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $20, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $4, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $43, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $21, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $5, %bl
+; X32-NEXT:    andb $1, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $42, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $22, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $6, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $41, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $23, %k0, %k1
+; X32-NEXT:    shrb $7, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $40, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $24, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $24, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $39, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $25, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $2, %bl
+; X32-NEXT:    shrb %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $38, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $26, %k0, %k1
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $37, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $27, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslld $24, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $4, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $36, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $28, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $28, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $35, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $29, %k0, %k1
 ; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $5, %dl
+; X32-NEXT:    shrl $29, %edx
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpsllq $40, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpsllq $56, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $24, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastq %xmm1, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; X32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm1, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $29, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrl $31, %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; X32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k1, %zmm7
-; X32-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm6, %ymm1, %ymm7, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $34, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $30, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $30, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $33, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $31, %k0, %k1
+; X32-NEXT:    shrl $31, %ecx
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $32, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $32, %k0, %k1
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $31, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $33, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $30, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $34, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $29, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $35, %k0, %k1
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $28, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $36, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $27, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $37, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $5, %cl
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $26, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $38, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $25, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $39, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $24, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $40, %k0, %k1
 ; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
 ; X32-NEXT:    movb %ah, %cl
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k3
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $13, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ebx, %k7
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $23, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $41, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $22, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $42, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $21, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $43, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $12, %esi
+; X32-NEXT:    andl $15, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $14, %esi
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    kmovd %esi, %k3
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $15, %esi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    kmovd %esi, %k4
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $20, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $44, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $19, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $45, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $18, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $46, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $17, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $47, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $16, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $48, %k0, %k1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $15, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $49, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $14, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $50, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $13, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $51, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $12, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k4
+; X32-NEXT:    kshiftrq $52, %k4, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $4, %dl
 ; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kxorq %k1, %k0, %k5
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $5, %dl
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k7
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k0
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $24, %ecx
 ; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vpblendvb %ymm5, %ymm6, %ymm0, %ymm0
+; X32-NEXT:    kmovd %edx, %k2
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
-; X32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm1, %ymm2
+; X32-NEXT:    kmovd %edx, %k3
+; X32-NEXT:    kshiftlq $63, %k5, %k5
+; X32-NEXT:    kshiftrq $11, %k5, %k5
+; X32-NEXT:    kxorq %k4, %k5, %k4
+; X32-NEXT:    kshiftrq $53, %k4, %k5
+; X32-NEXT:    kxorq %k6, %k5, %k5
+; X32-NEXT:    kshiftlq $63, %k5, %k5
+; X32-NEXT:    kshiftrq $10, %k5, %k5
+; X32-NEXT:    kxorq %k4, %k5, %k5
+; X32-NEXT:    kshiftrq $54, %k5, %k4
+; X32-NEXT:    kxorq %k7, %k4, %k6
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $29, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k2
-; X32-NEXT:    vpmovm2b %k2, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k7
+; X32-NEXT:    kshiftlq $63, %k6, %k6
+; X32-NEXT:    kshiftrq $9, %k6, %k6
+; X32-NEXT:    kxorq %k5, %k6, %k5
+; X32-NEXT:    kshiftrq $55, %k5, %k6
+; X32-NEXT:    kxorq %k0, %k6, %k0
+; X32-NEXT:    kshiftlq $63, %k0, %k0
+; X32-NEXT:    kshiftrq $8, %k0, %k0
+; X32-NEXT:    kxorq %k5, %k0, %k0
+; X32-NEXT:    kshiftrq $56, %k0, %k5
+; X32-NEXT:    kxorq %k1, %k5, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k1, %zmm2
-; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k6
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $7, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $57, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $6, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $58, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $5, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $59, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $4, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $60, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $3, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $61, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $2, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $62, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
 ; X32-NEXT:    shrl $31, %eax
-; X32-NEXT:    kmovd %eax, %k1
+; X32-NEXT:    kmovd %eax, %k2
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $1, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
 ; X32-NEXT:    kshiftlq $1, %k0, %k0
 ; X32-NEXT:    kshiftrq $1, %k0, %k0
-; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k2, %k1
 ; X32-NEXT:    korq %k1, %k0, %k1
-; X32-NEXT:    vpbroadcastb %eax, %zmm3 {%k1}
-; X32-NEXT:    vmovdqa64 %zmm3, %zmm0
+; X32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
@@ -850,710 +674,537 @@ define <8 x i64> @test_mm512_maskz_set1_
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset %esi, -12
 ; X32-NEXT:    .cfi_offset %ebx, -8
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    kmovd %eax, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kshiftrq $1, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $62, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $2, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpslld $24, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $5, %cl
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpsllq $40, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpsllq $56, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $61, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $3, %k0, %k1
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $60, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $4, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $4, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $59, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $5, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $5, %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $58, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $6, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $6, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $57, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $7, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $7, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $56, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $8, %k0, %k1
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $55, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $9, %k0, %k1
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $54, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $10, %k0, %k1
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $13, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $15, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $53, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $11, %k0, %k1
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $12, %eax
+; X32-NEXT:    andl $15, %eax
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $13, %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k3
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $14, %eax
+; X32-NEXT:    andl $3, %eax
+; X32-NEXT:    kmovd %eax, %k4
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $15, %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    kmovd %eax, %k5
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $16, %edx
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k6
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $4, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $5, %dl
-; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $24, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; X32-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $29, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; X32-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; X32-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrl $31, %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; X32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    andb $15, %bl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    shrb $2, %al
+; X32-NEXT:    kmovd %eax, %k7
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k1, %zmm7
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $52, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $12, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $51, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $13, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $50, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $14, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $49, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $15, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $48, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $16, %k0, %k1
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $47, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $17, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $46, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $18, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $45, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $19, %k0, %k1
+; X32-NEXT:    shrb $3, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $44, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $20, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $4, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $43, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $21, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $5, %bl
+; X32-NEXT:    andb $1, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $42, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $22, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $6, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $41, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $23, %k0, %k1
+; X32-NEXT:    shrb $7, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $40, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $24, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $24, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $39, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $25, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $2, %bl
+; X32-NEXT:    shrb %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $38, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $26, %k0, %k1
+; X32-NEXT:    andb $15, %dl
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $2, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $37, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $27, %k0, %k1
+; X32-NEXT:    shrb $3, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $36, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $28, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $28, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $35, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $29, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $29, %edx
+; X32-NEXT:    andb $1, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $34, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $30, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $30, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $33, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $31, %k0, %k1
+; X32-NEXT:    shrl $31, %ecx
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $32, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $32, %k0, %k1
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $31, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $33, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $30, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $34, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $29, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $35, %k0, %k1
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $28, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $36, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $27, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $37, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $5, %cl
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $26, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $38, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $25, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $39, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $24, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $40, %k0, %k1
 ; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
 ; X32-NEXT:    movb %ah, %cl
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k3
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $13, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ebx, %k7
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $23, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $41, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $22, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $42, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $21, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $43, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $12, %esi
+; X32-NEXT:    andl $15, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $14, %esi
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    kmovd %esi, %k3
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $15, %esi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    kmovd %esi, %k4
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $20, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $44, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $19, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $45, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $18, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $46, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $17, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $47, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $16, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $48, %k0, %k1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $15, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $49, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $14, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $50, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $13, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $51, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $12, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k4
+; X32-NEXT:    kshiftrq $52, %k4, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $4, %dl
 ; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kxorq %k1, %k0, %k5
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $5, %dl
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k7
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k0
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $24, %ecx
 ; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
-; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm1
+; X32-NEXT:    kmovd %edx, %k2
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
+; X32-NEXT:    kmovd %edx, %k3
+; X32-NEXT:    kshiftlq $63, %k5, %k5
+; X32-NEXT:    kshiftrq $11, %k5, %k5
+; X32-NEXT:    kxorq %k4, %k5, %k4
+; X32-NEXT:    kshiftrq $53, %k4, %k5
+; X32-NEXT:    kxorq %k6, %k5, %k5
+; X32-NEXT:    kshiftlq $63, %k5, %k5
+; X32-NEXT:    kshiftrq $10, %k5, %k5
+; X32-NEXT:    kxorq %k4, %k5, %k5
+; X32-NEXT:    kshiftrq $54, %k5, %k4
+; X32-NEXT:    kxorq %k7, %k4, %k6
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $29, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k2
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k3
-; X32-NEXT:    vpmovm2b %k3, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k1, %zmm2
-; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k7
+; X32-NEXT:    kshiftlq $63, %k6, %k6
+; X32-NEXT:    kshiftrq $9, %k6, %k6
+; X32-NEXT:    kxorq %k5, %k6, %k5
+; X32-NEXT:    kshiftrq $55, %k5, %k6
+; X32-NEXT:    kxorq %k0, %k6, %k0
+; X32-NEXT:    kshiftlq $63, %k0, %k0
+; X32-NEXT:    kshiftrq $8, %k0, %k0
+; X32-NEXT:    kxorq %k5, %k0, %k0
+; X32-NEXT:    kshiftrq $56, %k0, %k5
+; X32-NEXT:    kxorq %k1, %k5, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k2, %zmm2
-; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm2
-; X32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; X32-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT:    kmovd %ecx, %k6
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $7, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $57, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $6, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $58, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $5, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $59, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $4, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $60, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $3, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $61, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $2, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $62, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
 ; X32-NEXT:    shrl $31, %eax
-; X32-NEXT:    kmovd %eax, %k0
+; X32-NEXT:    kmovd %eax, %k2
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    vpmovb2m %zmm0, %k1
-; X32-NEXT:    kshiftlq $1, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
 ; X32-NEXT:    kshiftrq $1, %k1, %k1
-; X32-NEXT:    kshiftlq $63, %k0, %k0
-; X32-NEXT:    korq %k0, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftlq $1, %k0, %k0
+; X32-NEXT:    kshiftrq $1, %k0, %k0
+; X32-NEXT:    kshiftlq $63, %k2, %k1
+; X32-NEXT:    korq %k1, %k0, %k1
 ; X32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
@@ -2057,719 +1708,541 @@ define i64 @test_mm512_mask_test_epi8_ma
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    andl $-64, %esp
-; X32-NEXT:    subl $256, %esp # imm = 0x100
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_offset %esi, -16
 ; X32-NEXT:    .cfi_offset %ebx, -12
-; X32-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp) # 64-byte Spill
-; X32-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%esp) # 64-byte Spill
-; X32-NEXT:    movl 8(%ebp), %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    kmovd %eax, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpslld $24, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $5, %cl
-; X32-NEXT:    andb $1, %cl
+; X32-NEXT:    movl 8(%ebp), %ecx
 ; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpsllq $40, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kshiftrq $1, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $62, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $2, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $13, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $61, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $3, %k0, %k1
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $60, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $4, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $4, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $59, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $5, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $5, %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $58, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $6, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $6, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $57, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $7, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $7, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $56, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $8, %k0, %k1
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    shrb $2, %dl
+; X32-NEXT:    kmovd %edx, %k3
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k4
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $13, %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k5
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $16, %edx
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k6
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $15, %bl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    shrb $2, %al
+; X32-NEXT:    kmovd %eax, %k7
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $55, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $9, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $54, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $10, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $53, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $11, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $52, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $12, %k0, %k1
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    shrl $12, %esi
+; X32-NEXT:    andl $15, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $51, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $13, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $50, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $14, %k0, %k1
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    shrl $14, %esi
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $49, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $15, %k0, %k1
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    shrl $15, %esi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $48, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $16, %k0, %k1
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $47, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $17, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $46, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $18, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $45, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $19, %k0, %k1
+; X32-NEXT:    shrb $3, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $44, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $20, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $4, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $43, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $21, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $5, %bl
+; X32-NEXT:    andb $1, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $42, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $22, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $6, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $41, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $23, %k0, %k1
+; X32-NEXT:    shrb $7, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $40, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $24, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $24, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $39, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $25, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $2, %bl
+; X32-NEXT:    shrb %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $38, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $26, %k0, %k1
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $37, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $27, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $4, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $36, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $28, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $28, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $35, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $29, %k0, %k1
 ; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $5, %dl
+; X32-NEXT:    shrl $29, %edx
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $24, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $29, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; X32-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrl $31, %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; X32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k1, %zmm7
-; X32-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm6, %ymm1, %ymm7, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $34, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $30, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $30, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $33, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $31, %k0, %k1
+; X32-NEXT:    shrl $31, %ecx
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $32, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $32, %k0, %k1
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $31, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $33, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $30, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $34, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $29, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $35, %k0, %k1
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $28, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $36, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $27, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $37, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $5, %cl
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $26, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $38, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $25, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $39, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $24, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $40, %k0, %k1
 ; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
 ; X32-NEXT:    movb %ah, %cl
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k3
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $13, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    shrl $13, %ecx
+; X32-NEXT:    andb $1, %cl
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ebx, %k7
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $23, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $41, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $22, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $42, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $21, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $43, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $20, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $44, %k0, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $12, %esi
+; X32-NEXT:    andl $15, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $19, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $45, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $18, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $46, %k0, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $14, %esi
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $17, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $47, %k0, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $15, %esi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $16, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $48, %k0, %k1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $15, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $49, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $14, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $50, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $13, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $51, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $12, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $52, %k0, %k1
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $4, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $5, %dl
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k3
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $24, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovb2m %zmm0, %k1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
+; X32-NEXT:    kmovd %edx, %k7
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $11, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $53, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $10, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $54, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $9, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $55, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $8, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $56, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $7, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $57, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $6, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $58, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $5, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $59, %k0, %k1
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
-; X32-NEXT:    vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $4, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $60, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm5
-; X32-NEXT:    vpbroadcastd %xmm5, %xmm5
-; X32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm5, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $3, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $61, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $29, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; X32-NEXT:    vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $2, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $62, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm3
-; X32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; X32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $1, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
 ; X32-NEXT:    kshiftlq $1, %k0, %k0
 ; X32-NEXT:    kshiftrq $1, %k0, %k0
 ; X32-NEXT:    shrl $31, %eax
 ; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    kshiftlq $63, %k1, %k1
 ; X32-NEXT:    korq %k1, %k0, %k1
-; X32-NEXT:    vmovdqa64 {{[0-9]+}}(%esp), %zmm0 # 64-byte Reload
-; X32-NEXT:    vmovdqa64 {{[0-9]+}}(%esp), %zmm1 # 64-byte Reload
-; X32-NEXT:    vptestmb %zmm1, %zmm0, %k0 {%k1}
-; X32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT:    kmovq %k0, (%esp)
+; X32-NEXT:    movl (%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    leal -8(%ebp), %esp
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    popl %ebp
 ; X32-NEXT:    vzeroupper
@@ -2882,719 +2355,541 @@ define i64 @test_mm512_mask_testn_epi8_m
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    andl $-64, %esp
-; X32-NEXT:    subl $256, %esp # imm = 0x100
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_offset %esi, -16
 ; X32-NEXT:    .cfi_offset %ebx, -12
-; X32-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp) # 64-byte Spill
-; X32-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%esp) # 64-byte Spill
-; X32-NEXT:    movl 8(%ebp), %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    kmovd %eax, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpslld $24, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $5, %cl
-; X32-NEXT:    andb $1, %cl
+; X32-NEXT:    movl 8(%ebp), %ecx
 ; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpsllq $40, %xmm1, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    andb $2, %cl
-; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kshiftrq $1, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $62, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $2, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $13, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $61, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $3, %k0, %k1
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $60, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $4, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $4, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $59, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $5, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $5, %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $58, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $6, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $6, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $57, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $7, %k0, %k1
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrb $7, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $56, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $8, %k0, %k1
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    movb %ch, %al
+; X32-NEXT:    andb $15, %al
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    shrb $2, %dl
+; X32-NEXT:    kmovd %edx, %k3
+; X32-NEXT:    shrb $3, %al
+; X32-NEXT:    kmovd %eax, %k4
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $13, %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    kmovd %eax, %k5
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $16, %edx
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    andb $2, %al
+; X32-NEXT:    shrb %al
+; X32-NEXT:    kmovd %eax, %k6
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $15, %bl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    shrb $2, %al
+; X32-NEXT:    kmovd %eax, %k7
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $55, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $9, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $54, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $10, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $53, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $11, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $52, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $12, %k0, %k1
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    shrl $12, %esi
+; X32-NEXT:    andl $15, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $51, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $13, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $50, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $14, %k0, %k1
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    shrl $14, %esi
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $49, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $15, %k0, %k1
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    shrl $15, %esi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $48, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $16, %k0, %k1
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $47, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $17, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $46, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $18, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $45, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $19, %k0, %k1
+; X32-NEXT:    shrb $3, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $44, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $20, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $4, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $43, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $21, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $5, %bl
+; X32-NEXT:    andb $1, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $42, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $22, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    shrb $6, %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $41, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $23, %k0, %k1
+; X32-NEXT:    shrb $7, %dl
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $40, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $24, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $24, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $39, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $25, %k0, %k1
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    andb $2, %bl
+; X32-NEXT:    shrb %bl
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $38, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $26, %k0, %k1
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ebx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $37, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $27, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $4, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $36, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $28, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $28, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $35, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $29, %k0, %k1
 ; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $5, %dl
+; X32-NEXT:    shrl $29, %edx
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $24, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    andb $2, %dl
-; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $29, %ecx
-; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; X32-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    shrl $31, %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; X32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vpmovm2b %k1, %zmm7
-; X32-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm6, %ymm1, %ymm7, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $34, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $30, %k0, %k1
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    shrl $30, %edx
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $33, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $31, %k0, %k1
+; X32-NEXT:    shrl $31, %ecx
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $32, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $32, %k0, %k1
+; X32-NEXT:    kmovd %eax, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $31, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $33, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $30, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $34, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $29, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $35, %k0, %k1
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $28, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $36, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $4, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $27, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $37, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $5, %cl
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $26, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $38, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $6, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $25, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $39, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $24, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $40, %k0, %k1
 ; X32-NEXT:    movb %ah, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
 ; X32-NEXT:    andb $2, %cl
 ; X32-NEXT:    shrb %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
 ; X32-NEXT:    movb %ah, %cl
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k3
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $12, %ecx
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $13, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $14, %ecx
-; X32-NEXT:    andl $3, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    shrl $15, %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $16, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $15, %dl
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    shrb $2, %bl
-; X32-NEXT:    kmovd %ebx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ebx, %k7
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $23, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $41, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $22, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $42, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $21, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $43, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $20, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $44, %k0, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $12, %esi
+; X32-NEXT:    andl $15, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $19, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $45, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $18, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $46, %k0, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $14, %esi
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $17, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $47, %k0, %k1
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    shrl $15, %esi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    kmovd %esi, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $16, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $48, %k0, %k1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $15, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $49, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $14, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $50, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $13, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $51, %k0, %k1
 ; X32-NEXT:    shrb $3, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslld $24, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $12, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $52, %k0, %k1
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $4, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $5, %dl
 ; X32-NEXT:    andb $1, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k2
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $6, %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %edx, %k3
 ; X32-NEXT:    shrb $7, %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k4
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $24, %ecx
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k5
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    andb $2, %dl
 ; X32-NEXT:    shrb %dl
-; X32-NEXT:    kmovd %edx, %k1
-; X32-NEXT:    vpmovm2b %k1, %zmm0
-; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT:    kmovd %edx, %k6
 ; X32-NEXT:    andb $15, %cl
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    shrb $2, %dl
-; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    vpmovb2m %zmm0, %k1
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    vpmovm2b %k1, %zmm1
-; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
+; X32-NEXT:    kmovd %edx, %k7
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $11, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $53, %k0, %k1
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $10, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $54, %k0, %k1
+; X32-NEXT:    kxorq %k3, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $9, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $55, %k0, %k1
+; X32-NEXT:    kxorq %k4, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $8, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $56, %k0, %k1
+; X32-NEXT:    kxorq %k5, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $7, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $57, %k0, %k1
+; X32-NEXT:    kxorq %k6, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $6, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $58, %k0, %k1
+; X32-NEXT:    kxorq %k7, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $5, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $59, %k0, %k1
 ; X32-NEXT:    shrb $3, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm1
-; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
-; X32-NEXT:    vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $4, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $60, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $28, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm5
-; X32-NEXT:    vpbroadcastd %xmm5, %xmm5
-; X32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm5, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $3, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $61, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $29, %ecx
 ; X32-NEXT:    andb $1, %cl
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm4
-; X32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
-; X32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; X32-NEXT:    vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm0
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $2, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
+; X32-NEXT:    kshiftrq $62, %k0, %k1
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    shrl $30, %ecx
-; X32-NEXT:    kmovd %ecx, %k0
-; X32-NEXT:    vpmovm2b %k0, %zmm3
-; X32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; X32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT:    vpmovb2m %zmm0, %k0
+; X32-NEXT:    kmovd %ecx, %k2
+; X32-NEXT:    kxorq %k2, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k1, %k1
+; X32-NEXT:    kshiftrq $1, %k1, %k1
+; X32-NEXT:    kxorq %k0, %k1, %k0
 ; X32-NEXT:    kshiftlq $1, %k0, %k0
 ; X32-NEXT:    kshiftrq $1, %k0, %k0
 ; X32-NEXT:    shrl $31, %eax
 ; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    kshiftlq $63, %k1, %k1
 ; X32-NEXT:    korq %k1, %k0, %k1
-; X32-NEXT:    vmovdqa64 {{[0-9]+}}(%esp), %zmm0 # 64-byte Reload
-; X32-NEXT:    vmovdqa64 {{[0-9]+}}(%esp), %zmm1 # 64-byte Reload
-; X32-NEXT:    vptestnmb %zmm1, %zmm0, %k0 {%k1}
-; X32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT:    kmovq %k0, (%esp)
+; X32-NEXT:    movl (%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    leal -8(%ebp), %esp
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    popl %ebp
 ; X32-NEXT:    vzeroupper

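[editor's note: the updated X32 / AVX512F-32 check lines above repeat the same five-instruction pattern (kshiftrq, kxorq, kshiftlq $63, kshiftrq, kxorq) to splice a single bit into a 64-bit mask register. The following is a minimal scalar sketch of that pattern for reference only; the function name insertMaskBit and the standalone test are illustrative and not part of the patch.]

    #include <cstdint>
    #include <cassert>

    // Scalar model of the mask-bit insertion sequence seen in the CHECK lines:
    // bring the target bit to position 0, xor in the new value, isolate the
    // resulting difference bit, move it back to position Idx, and xor it into
    // the original mask so old ^ (old ^ new) leaves the new value in place.
    static uint64_t insertMaskBit(uint64_t Vec, unsigned Idx, uint64_t Bit) {
      uint64_t Tmp = (Vec >> Idx) ^ Bit; // kshiftrq $Idx + kxorq: old ^ new in bit 0
      Tmp <<= 63;                        // kshiftlq $63: keep only bit 0, clear the rest
      Tmp >>= (63 - Idx);                // kshiftrq $(63-Idx): place it at position Idx
      return Vec ^ Tmp;                  // kxorq: cancels the old bit, leaves the new one
    }

    int main() {
      assert(insertMaskBit(0b1010, 0, 1) == 0b1011); // set bit 0
      assert(insertMaskBit(0b1010, 1, 0) == 0b1000); // clear bit 1
      return 0;
    }
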
Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Thu Dec  7 16:16:09 2017
@@ -1795,753 +1795,574 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512F-32-NEXT:    pushl %esi
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    subl $60, %esp
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
+; AVX512F-32-NEXT:    subl $68, %esp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 80
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ecx, %edx
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k0
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    shrl $16, %eax
 ; AVX512F-32-NEXT:    movl %ebx, %edx
-; AVX512F-32-NEXT:    shrb $2, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    shrb $4, %bl
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    kmovd %ecx, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    kmovd %ebx, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %ch, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %ch, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    andb $2, %cl
+; AVX512F-32-NEXT:    shrb %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %edx, %ecx
 ; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $12, %eax
-; AVX512F-32-NEXT:    andl $15, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $14, %eax
-; AVX512F-32-NEXT:    andl $3, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $15, %eax
-; AVX512F-32-NEXT:    andl $1, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $16, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k2
+; AVX512F-32-NEXT:    movb %bh, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k3
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $6, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k6
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $7, %cl
+; AVX512F-32-NEXT:    kmovd %ebx, %k5
+; AVX512F-32-NEXT:    kshiftrq $1, %k5, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $62, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k5, %k1, %k7
+; AVX512F-32-NEXT:    kshiftrq $2, %k7, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k5
+; AVX512F-32-NEXT:    movb %bh, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %ebx, %esi
+; AVX512F-32-NEXT:    andb $2, %cl
+; AVX512F-32-NEXT:    shrb %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $61, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT:    kshiftrq $3, %k7, %k2
+; AVX512F-32-NEXT:    kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    movl %edx, %ecx
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $60, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $4, %k0, %k7
+; AVX512F-32-NEXT:    kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %edx, %k3
 ; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $59, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $5, %k7, %k0
+; AVX512F-32-NEXT:    kxorq %k4, %k0, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $13, %ecx
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $58, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT:    kshiftrq $6, %k7, %k4
+; AVX512F-32-NEXT:    kxorq %k6, %k4, %k6
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
 ; AVX512F-32-NEXT:    movl %eax, %ebx
-; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ebx, %edx
-; AVX512F-32-NEXT:    shrb $2, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    shrb $4, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    shrb $5, %dl
-; AVX512F-32-NEXT:    andb $1, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    andb $2, %bl
+; AVX512F-32-NEXT:    shrb %bl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $57, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $7, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %ebx, %k5
+; AVX512F-32-NEXT:    movl %edx, %ecx
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $56, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $8, %k7, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT:    kmovd %edx, %k6
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $55, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT:    kshiftrq $9, %k7, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $54, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT:    kshiftrq $10, %k7, %k2
+; AVX512F-32-NEXT:    kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    movl %esi, %edx
+; AVX512F-32-NEXT:    shrl $12, %edx
+; AVX512F-32-NEXT:    andl $15, %edx
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $53, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k7, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $11, %k3, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $52, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k3, %k0, %k3
+; AVX512F-32-NEXT:    kshiftrq $12, %k3, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    movl %eax, %edx
 ; AVX512F-32-NEXT:    shrb $6, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $14, %ecx
+; AVX512F-32-NEXT:    andl $3, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $51, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k3, %k7, %k3
+; AVX512F-32-NEXT:    kshiftrq $13, %k3, %k7
+; AVX512F-32-NEXT:    kxorq %k4, %k7, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $15, %ecx
+; AVX512F-32-NEXT:    andl $1, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $50, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $14, %k3, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $49, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $15, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $48, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $16, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %eax, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    movl %esi, %edx
+; AVX512F-32-NEXT:    shrl $24, %edx
 ; AVX512F-32-NEXT:    # kill: def %al killed %al killed %eax def %eax
 ; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $47, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $17, %k3, %k4
+; AVX512F-32-NEXT:    kxorq %k5, %k4, %k4
+; AVX512F-32-NEXT:    kmovd %eax, %k5
 ; AVX512F-32-NEXT:    movl %edx, %eax
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    movl %ecx, %esi
-; AVX512F-32-NEXT:    shrl $29, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $31, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %edx
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $46, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $18, %k4, %k3
+; AVX512F-32-NEXT:    kxorq %k6, %k3, %k6
+; AVX512F-32-NEXT:    kmovd %edx, %k3
+; AVX512F-32-NEXT:    # kill: def %dl killed %dl killed %edx def %edx
 ; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movl %edx, %eax
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $45, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k4, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $19, %k6, %k4
+; AVX512F-32-NEXT:    kxorq %k1, %k4, %k1
+; AVX512F-32-NEXT:    kmovd %eax, %k4
+; AVX512F-32-NEXT:    movl %edx, %ecx
 ; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $12, %eax
-; AVX512F-32-NEXT:    andl $15, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $14, %eax
-; AVX512F-32-NEXT:    andl $3, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $15, %eax
-; AVX512F-32-NEXT:    andl $1, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    shrl $16, %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $44, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k6, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $20, %k1, %k6
+; AVX512F-32-NEXT:    kxorq %k2, %k6, %k6
+; AVX512F-32-NEXT:    kmovd %edx, %k2
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    andb $15, %al
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $43, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT:    kshiftrq $21, %k1, %k6
+; AVX512F-32-NEXT:    kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $29, %ecx
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $42, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $22, %k6, %k1
+; AVX512F-32-NEXT:    kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
 ; AVX512F-32-NEXT:    movl %ebx, %edx
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $41, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $23, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %edx, %k5
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $2, %al
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $40, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $24, %k7, %k6
+; AVX512F-32-NEXT:    kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT:    kmovd %eax, %k6
+; AVX512F-32-NEXT:    movb %bh, %al
 ; AVX512F-32-NEXT:    andb $15, %al
-; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $39, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k7, %k3, %k7
+; AVX512F-32-NEXT:    kshiftrq $25, %k7, %k3
+; AVX512F-32-NEXT:    kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k3
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $38, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT:    kshiftrq $26, %k7, %k4
+; AVX512F-32-NEXT:    kxorq %k2, %k4, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    movl %esi, %edx
+; AVX512F-32-NEXT:    shrl $28, %edx
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $37, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $27, %k2, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $36, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k2, %k0, %k2
+; AVX512F-32-NEXT:    kshiftrq $28, %k2, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %ebx, %edx
+; AVX512F-32-NEXT:    shrb $6, %dl
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $30, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $35, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT:    kshiftrq $29, %k2, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $31, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $34, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $30, %k1, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $33, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT:    kshiftrq $31, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ebx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $7, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $31, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT:    kshiftrq $33, %k1, %k2
+; AVX512F-32-NEXT:    kxorq %k5, %k2, %k2
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $30, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $34, %k2, %k1
+; AVX512F-32-NEXT:    kxorq %k6, %k1, %k5
+; AVX512F-32-NEXT:    kmovd %ecx, %k6
+; AVX512F-32-NEXT:    movb %bh, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    andb $2, %cl
+; AVX512F-32-NEXT:    shrb %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $29, %k5, %k5
+; AVX512F-32-NEXT:    kxorq %k2, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $35, %k5, %k2
+; AVX512F-32-NEXT:    kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    movl %eax, %ecx
 ; AVX512F-32-NEXT:    shrb $2, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $28, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k5, %k3, %k5
+; AVX512F-32-NEXT:    kshiftrq $36, %k5, %k3
+; AVX512F-32-NEXT:    kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %eax, %k3
 ; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    # kill: def %bl killed %bl killed %ebx def %ebx
-; AVX512F-32-NEXT:    shrb $7, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    shrl $16, %eax
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $27, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k5, %k4, %k5
+; AVX512F-32-NEXT:    kshiftrq $37, %k5, %k4
+; AVX512F-32-NEXT:    kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $13, %ecx
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $26, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k5, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $38, %k0, %k5
+; AVX512F-32-NEXT:    kxorq %k7, %k5, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k5
 ; AVX512F-32-NEXT:    movl %eax, %edx
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $25, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $39, %k7, %k0
+; AVX512F-32-NEXT:    kxorq %k6, %k0, %k6
+; AVX512F-32-NEXT:    kmovd %edx, %k0
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp) # 8-byte Spill
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    andb $15, %cl
+; AVX512F-32-NEXT:    movl %ecx, %edx
+; AVX512F-32-NEXT:    shrb $2, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $24, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $40, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $12, %ecx
+; AVX512F-32-NEXT:    andl $15, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $23, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $41, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $14, %ecx
+; AVX512F-32-NEXT:    andl $3, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $22, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k6, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $42, %k2, %k6
+; AVX512F-32-NEXT:    kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $15, %ecx
+; AVX512F-32-NEXT:    andl $1, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $21, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k2, %k3, %k2
+; AVX512F-32-NEXT:    kshiftrq $43, %k2, %k3
+; AVX512F-32-NEXT:    kxorq %k4, %k3, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k6
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $20, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k2, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $44, %k3, %k2
+; AVX512F-32-NEXT:    kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k2
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $19, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k3, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $45, %k0, %k3
+; AVX512F-32-NEXT:    kxorq %k5, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k3
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $18, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT:    kshiftrq $46, %k0, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k5
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $6, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $17, %k5, %k5
+; AVX512F-32-NEXT:    kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT:    kshiftrq $47, %k0, %k5
+; AVX512F-32-NEXT:    kxorq %k6, %k5, %k5
+; AVX512F-32-NEXT:    kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $16, %k5, %k5
+; AVX512F-32-NEXT:    kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT:    kshiftrq $48, %k0, %k5
+; AVX512F-32-NEXT:    kmovd %eax, %k6
+; AVX512F-32-NEXT:    kxorq %k6, %k5, %k6
+; AVX512F-32-NEXT:    kmovd %ecx, %k5
+; AVX512F-32-NEXT:    movl %ebx, %edx
+; AVX512F-32-NEXT:    shrl $24, %edx
+; AVX512F-32-NEXT:    # kill: def %al killed %al killed %eax def %eax
+; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $15, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $49, %k6, %k0
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k7 # 8-byte Reload
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $14, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $50, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k7
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    # kill: def %dl killed %dl killed %edx def %edx
 ; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $13, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $51, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k2, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k2
 ; AVX512F-32-NEXT:    movl %edx, %eax
 ; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $12, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $52, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %edx, %k3
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
-; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $11, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $53, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k4, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k4
+; AVX512F-32-NEXT:    movl %ebx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $10, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $54, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k5
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $9, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $55, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $8, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k6, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $56, %k0, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $7, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $57, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $6, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k3, %k1, %k1
+; AVX512F-32-NEXT:    movl %ebx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $5, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $59, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k4, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $4, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512F-32-NEXT:    kmovd %eax, %k2
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    movl %ebx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $30, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $3, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $61, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k5, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $2, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $1, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kshiftrq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpleb %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    adcl %ecx, %edx
-; AVX512F-32-NEXT:    addl $60, %esp
+; AVX512F-32-NEXT:    adcl %ebx, %edx
+; AVX512F-32-NEXT:    addl $68, %esp
 ; AVX512F-32-NEXT:    popl %esi
 ; AVX512F-32-NEXT:    popl %ebx
 ; AVX512F-32-NEXT:    vzeroupper
@@ -2679,753 +2500,574 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512F-32-NEXT:    pushl %esi
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    subl $60, %esp
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
+; AVX512F-32-NEXT:    subl $68, %esp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 80
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ecx, %edx
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k0
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    shrl $16, %eax
 ; AVX512F-32-NEXT:    movl %ebx, %edx
-; AVX512F-32-NEXT:    shrb $2, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    shrb $4, %bl
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    kmovd %ecx, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    kmovd %ebx, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %ch, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %ch, %dl
 ; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    andb $2, %cl
+; AVX512F-32-NEXT:    shrb %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %edx, %ecx
 ; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $12, %eax
-; AVX512F-32-NEXT:    andl $15, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $14, %eax
-; AVX512F-32-NEXT:    andl $3, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $15, %eax
-; AVX512F-32-NEXT:    andl $1, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $16, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k2
+; AVX512F-32-NEXT:    movb %bh, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k3
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $6, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k6
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $7, %cl
+; AVX512F-32-NEXT:    kmovd %ebx, %k5
+; AVX512F-32-NEXT:    kshiftrq $1, %k5, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $62, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k5, %k1, %k7
+; AVX512F-32-NEXT:    kshiftrq $2, %k7, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k5
+; AVX512F-32-NEXT:    movb %bh, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %ebx, %esi
+; AVX512F-32-NEXT:    andb $2, %cl
+; AVX512F-32-NEXT:    shrb %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $61, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT:    kshiftrq $3, %k7, %k2
+; AVX512F-32-NEXT:    kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    movl %edx, %ecx
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $60, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $4, %k0, %k7
+; AVX512F-32-NEXT:    kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %edx, %k3
 ; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $59, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $5, %k7, %k0
+; AVX512F-32-NEXT:    kxorq %k4, %k0, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $13, %ecx
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $58, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT:    kshiftrq $6, %k7, %k4
+; AVX512F-32-NEXT:    kxorq %k6, %k4, %k6
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
 ; AVX512F-32-NEXT:    movl %eax, %ebx
-; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ebx, %edx
-; AVX512F-32-NEXT:    shrb $2, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    shrb $4, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    shrb $5, %dl
-; AVX512F-32-NEXT:    andb $1, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    andb $2, %bl
+; AVX512F-32-NEXT:    shrb %bl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $57, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $7, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %ebx, %k5
+; AVX512F-32-NEXT:    movl %edx, %ecx
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $56, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $8, %k7, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT:    kmovd %edx, %k6
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $55, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT:    kshiftrq $9, %k7, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $54, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT:    kshiftrq $10, %k7, %k2
+; AVX512F-32-NEXT:    kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    movl %esi, %edx
+; AVX512F-32-NEXT:    shrl $12, %edx
+; AVX512F-32-NEXT:    andl $15, %edx
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $53, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k7, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $11, %k3, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $52, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k3, %k0, %k3
+; AVX512F-32-NEXT:    kshiftrq $12, %k3, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    movl %eax, %edx
 ; AVX512F-32-NEXT:    shrb $6, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $14, %ecx
+; AVX512F-32-NEXT:    andl $3, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $51, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k3, %k7, %k3
+; AVX512F-32-NEXT:    kshiftrq $13, %k3, %k7
+; AVX512F-32-NEXT:    kxorq %k4, %k7, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $15, %ecx
+; AVX512F-32-NEXT:    andl $1, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $50, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $14, %k3, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $49, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $15, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $48, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $16, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %eax, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    movl %esi, %edx
+; AVX512F-32-NEXT:    shrl $24, %edx
 ; AVX512F-32-NEXT:    # kill: def %al killed %al killed %eax def %eax
 ; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    andb $2, %dl
-; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %edx
-; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $47, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT:    kshiftrq $17, %k3, %k4
+; AVX512F-32-NEXT:    kxorq %k5, %k4, %k4
+; AVX512F-32-NEXT:    kmovd %eax, %k5
 ; AVX512F-32-NEXT:    movl %edx, %eax
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    movl %ecx, %esi
-; AVX512F-32-NEXT:    shrl $29, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %esi, %eax
-; AVX512F-32-NEXT:    shrl $31, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %edx
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $46, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k3, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $18, %k4, %k3
+; AVX512F-32-NEXT:    kxorq %k6, %k3, %k6
+; AVX512F-32-NEXT:    kmovd %edx, %k3
+; AVX512F-32-NEXT:    # kill: def %dl killed %dl killed %edx def %edx
 ; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movl %edx, %eax
-; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $7, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movb %ch, %dl
-; AVX512F-32-NEXT:    andb $15, %dl
-; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $45, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k4, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $19, %k6, %k4
+; AVX512F-32-NEXT:    kxorq %k1, %k4, %k1
+; AVX512F-32-NEXT:    kmovd %eax, %k4
+; AVX512F-32-NEXT:    movl %edx, %ecx
 ; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $12, %eax
-; AVX512F-32-NEXT:    andl $15, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $14, %eax
-; AVX512F-32-NEXT:    andl $3, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $15, %eax
-; AVX512F-32-NEXT:    andl $1, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %ebx
-; AVX512F-32-NEXT:    shrl $16, %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $44, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k6, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $20, %k1, %k6
+; AVX512F-32-NEXT:    kxorq %k2, %k6, %k6
+; AVX512F-32-NEXT:    kmovd %edx, %k2
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    andb $15, %al
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $43, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT:    kshiftrq $21, %k1, %k6
+; AVX512F-32-NEXT:    kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $29, %ecx
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $42, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $22, %k6, %k1
+; AVX512F-32-NEXT:    kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
 ; AVX512F-32-NEXT:    movl %ebx, %edx
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $41, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $23, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %edx, %k5
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $2, %al
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $40, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $24, %k7, %k6
+; AVX512F-32-NEXT:    kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT:    kmovd %eax, %k6
+; AVX512F-32-NEXT:    movb %bh, %al
 ; AVX512F-32-NEXT:    andb $15, %al
-; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $39, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k7, %k3, %k7
+; AVX512F-32-NEXT:    kshiftrq $25, %k7, %k3
+; AVX512F-32-NEXT:    kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k3
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $38, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT:    kshiftrq $26, %k7, %k4
+; AVX512F-32-NEXT:    kxorq %k2, %k4, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    movl %esi, %edx
+; AVX512F-32-NEXT:    shrl $28, %edx
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $37, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $27, %k2, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $36, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k2, %k0, %k2
+; AVX512F-32-NEXT:    kshiftrq $28, %k2, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %ebx, %edx
+; AVX512F-32-NEXT:    shrb $6, %dl
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $30, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $35, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT:    kshiftrq $29, %k2, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    movl %esi, %ecx
+; AVX512F-32-NEXT:    shrl $31, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $34, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $30, %k1, %k2
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $33, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT:    kshiftrq $31, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512F-32-NEXT:    kmovd %ebx, %k7
+; AVX512F-32-NEXT:    kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT:    kmovd %edx, %k7
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrb $7, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $31, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT:    kshiftrq $33, %k1, %k2
+; AVX512F-32-NEXT:    kxorq %k5, %k2, %k2
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $30, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k1, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $34, %k2, %k1
+; AVX512F-32-NEXT:    kxorq %k6, %k1, %k5
+; AVX512F-32-NEXT:    kmovd %ecx, %k6
+; AVX512F-32-NEXT:    movb %bh, %cl
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    andb $2, %cl
+; AVX512F-32-NEXT:    shrb %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $29, %k5, %k5
+; AVX512F-32-NEXT:    kxorq %k2, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $35, %k5, %k2
+; AVX512F-32-NEXT:    kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    movl %eax, %ecx
 ; AVX512F-32-NEXT:    shrb $2, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $28, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k5, %k3, %k5
+; AVX512F-32-NEXT:    kshiftrq $36, %k5, %k3
+; AVX512F-32-NEXT:    kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %eax, %k3
 ; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    # kill: def %bl killed %bl killed %ebx def %ebx
-; AVX512F-32-NEXT:    shrb $7, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    shrl $16, %eax
+; AVX512F-32-NEXT:    shrb $3, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $27, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k5, %k4, %k5
+; AVX512F-32-NEXT:    kshiftrq $37, %k5, %k4
+; AVX512F-32-NEXT:    kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $13, %ecx
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $26, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k5, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $38, %k0, %k5
+; AVX512F-32-NEXT:    kxorq %k7, %k5, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k5
 ; AVX512F-32-NEXT:    movl %eax, %edx
 ; AVX512F-32-NEXT:    andb $2, %dl
 ; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $25, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $39, %k7, %k0
+; AVX512F-32-NEXT:    kxorq %k6, %k0, %k6
+; AVX512F-32-NEXT:    kmovd %edx, %k0
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp) # 8-byte Spill
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    andb $15, %cl
+; AVX512F-32-NEXT:    movl %ecx, %edx
+; AVX512F-32-NEXT:    shrb $2, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $24, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $40, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $12, %ecx
+; AVX512F-32-NEXT:    andl $15, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $23, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $41, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT:    kmovd %ecx, %k0
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $14, %ecx
+; AVX512F-32-NEXT:    andl $3, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $22, %k2, %k2
+; AVX512F-32-NEXT:    kxorq %k6, %k2, %k2
+; AVX512F-32-NEXT:    kshiftrq $42, %k2, %k6
+; AVX512F-32-NEXT:    kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k7
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $15, %ecx
+; AVX512F-32-NEXT:    andl $1, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $21, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k2, %k3, %k2
+; AVX512F-32-NEXT:    kshiftrq $43, %k2, %k3
+; AVX512F-32-NEXT:    kxorq %k4, %k3, %k3
+; AVX512F-32-NEXT:    kmovd %ecx, %k6
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $20, %k3, %k3
+; AVX512F-32-NEXT:    kxorq %k2, %k3, %k3
+; AVX512F-32-NEXT:    kshiftrq $44, %k3, %k2
+; AVX512F-32-NEXT:    kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT:    kmovd %edx, %k2
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $4, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $19, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k3, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $45, %k0, %k3
+; AVX512F-32-NEXT:    kxorq %k5, %k3, %k4
+; AVX512F-32-NEXT:    kmovd %ecx, %k3
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $5, %cl
+; AVX512F-32-NEXT:    andb $1, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT:    kshiftrq $18, %k4, %k4
+; AVX512F-32-NEXT:    kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT:    kshiftrq $46, %k0, %k4
+; AVX512F-32-NEXT:    kxorq %k7, %k4, %k5
+; AVX512F-32-NEXT:    kmovd %ecx, %k4
+; AVX512F-32-NEXT:    movl %eax, %ecx
+; AVX512F-32-NEXT:    shrb $6, %cl
+; AVX512F-32-NEXT:    kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $17, %k5, %k5
+; AVX512F-32-NEXT:    kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT:    kshiftrq $47, %k0, %k5
+; AVX512F-32-NEXT:    kxorq %k6, %k5, %k5
+; AVX512F-32-NEXT:    kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT:    kshiftrq $16, %k5, %k5
+; AVX512F-32-NEXT:    kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT:    kshiftrq $48, %k0, %k5
+; AVX512F-32-NEXT:    kmovd %eax, %k6
+; AVX512F-32-NEXT:    kxorq %k6, %k5, %k6
+; AVX512F-32-NEXT:    kmovd %ecx, %k5
+; AVX512F-32-NEXT:    movl %ebx, %edx
+; AVX512F-32-NEXT:    shrl $24, %edx
+; AVX512F-32-NEXT:    # kill: def %al killed %al killed %eax def %eax
+; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $15, %k6, %k6
+; AVX512F-32-NEXT:    kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT:    kshiftrq $49, %k6, %k0
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k7 # 8-byte Reload
+; AVX512F-32-NEXT:    kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $14, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $50, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k1, %k7, %k7
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    # kill: def %dl killed %dl killed %edx def %edx
 ; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $13, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $51, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k2, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k2
 ; AVX512F-32-NEXT:    movl %edx, %eax
 ; AVX512F-32-NEXT:    shrb $2, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $12, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $52, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %edx, %k3
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
-; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $11, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $53, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k4, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k4
+; AVX512F-32-NEXT:    movl %ebx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $10, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $54, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT:    kmovd %eax, %k5
+; AVX512F-32-NEXT:    kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT:    kshiftrq $9, %k7, %k7
+; AVX512F-32-NEXT:    kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT:    kshiftrq $55, %k6, %k7
+; AVX512F-32-NEXT:    kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT:    kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $8, %k0, %k0
+; AVX512F-32-NEXT:    kxorq %k6, %k0, %k0
+; AVX512F-32-NEXT:    kshiftrq $56, %k0, %k6
+; AVX512F-32-NEXT:    kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $7, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $57, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $6, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k3, %k1, %k1
+; AVX512F-32-NEXT:    movl %ebx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $5, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $59, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k4, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $4, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512F-32-NEXT:    kmovd %eax, %k2
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    movl %ebx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    movl %ebx, %ecx
+; AVX512F-32-NEXT:    shrl $30, %ecx
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $3, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $61, %k0, %k1
+; AVX512F-32-NEXT:    kxorq %k5, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $2, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512F-32-NEXT:    kmovd %ecx, %k2
+; AVX512F-32-NEXT:    kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT:    kshiftrq $1, %k1, %k1
+; AVX512F-32-NEXT:    kxorq %k0, %k1, %k0
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kshiftrq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT:    vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT:    vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    adcl %ecx, %edx
-; AVX512F-32-NEXT:    addl $60, %esp
+; AVX512F-32-NEXT:    adcl %ebx, %edx
+; AVX512F-32-NEXT:    addl $68, %esp
 ; AVX512F-32-NEXT:    popl %esi
 ; AVX512F-32-NEXT:    popl %ebx
 ; AVX512F-32-NEXT:    vzeroupper
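
A note on the new AVX512F-32 sequences above: each inserted bit is handled by one repeated pattern of mask-register shifts and xors. The mask is shifted right to bring the target bit to the LSB, kxorq'd against the register holding the new bit, the single difference bit is isolated with kshiftlq $63 followed by kshiftrq $(63 - index), and that bit is kxorq'd back into the mask. The following scalar sketch is purely illustrative; the helper name and the use of uint64_t are assumptions for exposition and are not part of the patch.

    #include <cstdint>

    // Illustrative model of the shift+xor single-bit insert seen in the
    // CHECK lines above (three shifts and two xors, no AND/OR masking).
    // Only the low bit of NewBit is meaningful; higher bits are discarded
    // by the isolate step, matching the kshiftlq $63 behavior.
    static uint64_t insertMaskBit(uint64_t Mask, uint64_t NewBit, unsigned Idx) {
      uint64_t Diff = (Mask >> Idx) ^ NewBit;  // old bit XOR new bit, now in the LSB
      Diff = (Diff << 63) >> (63 - Idx);       // keep only that bit, move it to position Idx
      return Mask ^ Diff;                      // XOR cancels the old bit, leaving NewBit
    }
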

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=320120&r1=320119&r2=320120&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Thu Dec  7 16:16:09 2017
@@ -4207,39 +4207,35 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4262,39 +4258,35 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4337,39 +4329,35 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4414,39 +4402,35 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4475,39 +4459,35 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4552,39 +4532,35 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4613,39 +4589,35 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4668,39 +4640,35 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4743,39 +4711,35 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4820,39 +4784,35 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4881,39 +4841,35 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4958,39 +4914,35 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -8521,23 +8473,21 @@ define zeroext i8 @test_vpcmpeqq_v2i1_v8
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8560,23 +8510,21 @@ define zeroext i8 @test_vpcmpeqq_v2i1_v8
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8611,23 +8559,21 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8664,23 +8610,21 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8709,23 +8653,21 @@ define zeroext i8 @test_vpcmpeqq_v2i1_v8
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8762,23 +8704,21 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8807,23 +8747,21 @@ define zeroext i16 @test_vpcmpeqq_v2i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8846,23 +8784,21 @@ define zeroext i16 @test_vpcmpeqq_v2i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8897,23 +8833,21 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8950,23 +8884,21 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8995,23 +8927,21 @@ define zeroext i16 @test_vpcmpeqq_v2i1_v
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9048,23 +8978,21 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9727,36 +9655,33 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9784,36 +9709,33 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9861,36 +9783,33 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9940,36 +9859,33 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10003,36 +9919,33 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10082,36 +9995,33 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10145,36 +10055,33 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10202,36 +10109,33 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10279,36 +10183,33 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10358,36 +10259,33 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10421,36 +10319,33 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10500,36 +10395,33 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -16550,39 +16442,35 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -16605,39 +16493,35 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -16680,39 +16564,35 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -16757,39 +16637,35 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -16818,39 +16694,35 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -16895,39 +16767,35 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -16956,39 +16824,35 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17011,39 +16875,35 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17086,39 +16946,35 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17163,39 +17019,35 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17224,39 +17076,35 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17301,39 +17149,35 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -20864,23 +20708,21 @@ define zeroext i8 @test_vpcmpsgtq_v2i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -20903,23 +20745,21 @@ define zeroext i8 @test_vpcmpsgtq_v2i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -20954,23 +20794,21 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21007,23 +20845,21 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21052,23 +20888,21 @@ define zeroext i8 @test_vpcmpsgtq_v2i1_v
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21105,23 +20939,21 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21150,23 +20982,21 @@ define zeroext i16 @test_vpcmpsgtq_v2i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21189,23 +21019,21 @@ define zeroext i16 @test_vpcmpsgtq_v2i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21240,23 +21068,21 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21293,23 +21119,21 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21338,23 +21162,21 @@ define zeroext i16 @test_vpcmpsgtq_v2i1_
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21391,23 +21213,21 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22070,36 +21890,33 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22127,36 +21944,33 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22204,36 +22018,33 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22283,36 +22094,33 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22346,36 +22154,33 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22425,36 +22230,33 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22488,36 +22290,33 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22545,36 +22344,33 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22622,36 +22418,33 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22701,36 +22494,33 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22764,36 +22554,33 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -22843,36 +22630,33 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -28991,39 +28775,35 @@ define zeroext i8 @test_vpcmpsged_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29049,39 +28829,35 @@ define zeroext i8 @test_vpcmpsged_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29124,39 +28900,35 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29202,39 +28974,35 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29265,39 +29033,35 @@ define zeroext i8 @test_vpcmpsged_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29342,39 +29106,35 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29405,39 +29165,35 @@ define zeroext i16 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29463,39 +29219,35 @@ define zeroext i16 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29538,39 +29290,35 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29616,39 +29364,35 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29679,39 +29423,35 @@ define zeroext i16 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -29756,39 +29496,35 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -33345,23 +33081,21 @@ define zeroext i8 @test_vpcmpsgeq_v2i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33387,23 +33121,21 @@ define zeroext i8 @test_vpcmpsgeq_v2i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33438,23 +33170,21 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33492,23 +33222,21 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33539,23 +33267,21 @@ define zeroext i8 @test_vpcmpsgeq_v2i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33592,23 +33318,21 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33639,23 +33363,21 @@ define zeroext i16 @test_vpcmpsgeq_v2i1_
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33681,23 +33403,21 @@ define zeroext i16 @test_vpcmpsgeq_v2i1_
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33732,23 +33452,21 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33786,23 +33504,21 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33833,23 +33549,21 @@ define zeroext i16 @test_vpcmpsgeq_v2i1_
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -33886,23 +33600,21 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34583,36 +34295,33 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -34643,36 +34352,33 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -34722,36 +34428,33 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -34804,36 +34507,33 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -34869,36 +34569,33 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -34950,36 +34647,33 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -35015,36 +34709,33 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -35075,36 +34766,33 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -35154,36 +34842,33 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -35236,36 +34921,33 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -35301,36 +34983,33 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -35382,36 +35061,33 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -41579,39 +41255,35 @@ define zeroext i8 @test_vpcmpultd_v4i1_v
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -41637,39 +41309,35 @@ define zeroext i8 @test_vpcmpultd_v4i1_v
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -41715,39 +41383,35 @@ define zeroext i8 @test_masked_vpcmpultd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -41795,39 +41459,35 @@ define zeroext i8 @test_masked_vpcmpultd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -41859,39 +41519,35 @@ define zeroext i8 @test_vpcmpultd_v4i1_v
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -41939,39 +41595,35 @@ define zeroext i8 @test_masked_vpcmpultd
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -42003,39 +41655,35 @@ define zeroext i16 @test_vpcmpultd_v4i1_
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -42061,39 +41709,35 @@ define zeroext i16 @test_vpcmpultd_v4i1_
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -42139,39 +41783,35 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -42219,39 +41859,35 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -42283,39 +41919,35 @@ define zeroext i16 @test_vpcmpultd_v4i1_
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -42363,39 +41995,35 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -45983,23 +45611,21 @@ define zeroext i8 @test_vpcmpultq_v2i1_v
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46025,23 +45651,21 @@ define zeroext i8 @test_vpcmpultq_v2i1_v
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46079,23 +45703,21 @@ define zeroext i8 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46135,23 +45757,21 @@ define zeroext i8 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46183,23 +45803,21 @@ define zeroext i8 @test_vpcmpultq_v2i1_v
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46239,23 +45857,21 @@ define zeroext i8 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46287,23 +45903,21 @@ define zeroext i16 @test_vpcmpultq_v2i1_
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46329,23 +45943,21 @@ define zeroext i16 @test_vpcmpultq_v2i1_
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46383,23 +45995,21 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46439,23 +46049,21 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46487,23 +46095,21 @@ define zeroext i16 @test_vpcmpultq_v2i1_
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -46543,23 +46149,21 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47261,36 +46865,33 @@ define zeroext i8 @test_vpcmpultq_v4i1_v
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47321,36 +46922,33 @@ define zeroext i8 @test_vpcmpultq_v4i1_v
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47401,36 +46999,33 @@ define zeroext i8 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47483,36 +47078,33 @@ define zeroext i8 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47549,36 +47141,33 @@ define zeroext i8 @test_vpcmpultq_v4i1_v
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47631,36 +47220,33 @@ define zeroext i8 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47697,36 +47283,33 @@ define zeroext i16 @test_vpcmpultq_v4i1_
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47757,36 +47340,33 @@ define zeroext i16 @test_vpcmpultq_v4i1_
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47837,36 +47417,33 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47919,36 +47496,33 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -47985,36 +47559,33 @@ define zeroext i16 @test_vpcmpultq_v4i1_
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -48067,36 +47638,33 @@ define zeroext i16 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -49961,39 +49529,35 @@ define zeroext i8 @test_vcmpoeqps_v4i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
@@ -50016,39 +49580,35 @@ define zeroext i8 @test_vcmpoeqps_v4i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
@@ -50073,39 +49633,35 @@ define zeroext i8 @test_vcmpoeqps_v4i1_v
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
@@ -50137,36 +49693,33 @@ define zeroext i8 @test_masked_vcmpoeqps
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -50201,36 +49754,33 @@ define zeroext i8 @test_masked_vcmpoeqps
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -50267,36 +49817,33 @@ define zeroext i8 @test_masked_vcmpoeqps
 ; NoVLX-NEXT:    vcmpeqps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -50328,39 +49875,35 @@ define zeroext i16 @test_vcmpoeqps_v4i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
@@ -50383,39 +49926,35 @@ define zeroext i16 @test_vcmpoeqps_v4i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
@@ -50440,39 +49979,35 @@ define zeroext i16 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
@@ -50504,36 +50039,33 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -50568,36 +50100,33 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -50634,36 +50163,33 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vcmpeqps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -54198,23 +53724,21 @@ define zeroext i8 @test_vcmpoeqpd_v2i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54237,23 +53761,21 @@ define zeroext i8 @test_vcmpoeqpd_v2i1_v
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54278,23 +53800,21 @@ define zeroext i8 @test_vcmpoeqpd_v2i1_v
 ; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54326,20 +53846,19 @@ define zeroext i8 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -54374,20 +53893,19 @@ define zeroext i8 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -54424,20 +53942,19 @@ define zeroext i8 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    vcmpeqpd %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -54469,23 +53986,21 @@ define zeroext i16 @test_vcmpoeqpd_v2i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54508,23 +54023,21 @@ define zeroext i16 @test_vcmpoeqpd_v2i1_
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54549,23 +54062,21 @@ define zeroext i16 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
-; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54597,20 +54108,19 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -54645,20 +54155,19 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -54695,20 +54204,19 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vcmpeqpd %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55344,36 +54852,33 @@ define zeroext i8 @test_vcmpoeqpd_v4i1_v
 ; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55401,36 +54906,33 @@ define zeroext i8 @test_vcmpoeqpd_v4i1_v
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55460,36 +54962,33 @@ define zeroext i8 @test_vcmpoeqpd_v4i1_v
 ; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55526,36 +55025,33 @@ define zeroext i8 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55592,36 +55088,33 @@ define zeroext i8 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55660,36 +55153,33 @@ define zeroext i8 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %al killed %al killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55723,36 +55213,33 @@ define zeroext i16 @test_vcmpoeqpd_v4i1_
 ; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55780,36 +55267,33 @@ define zeroext i16 @test_vcmpoeqpd_v4i1_
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55839,36 +55323,33 @@ define zeroext i16 @test_vcmpoeqpd_v4i1_
 ; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55905,36 +55386,33 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -55971,36 +55449,33 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -56039,36 +55514,33 @@ define zeroext i16 @test_masked_vcmpoeqp
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    kmovw %eax, %k0
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
 ; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $1, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $1, %k2, %k2
+; NoVLX-NEXT:    korw %k1, %k2, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k2
+; NoVLX-NEXT:    kxorw %k0, %k2, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kxorw %k1, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $13, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
+; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k2
+; NoVLX-NEXT:    kxorw %k2, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftrw $12, %k1, %k1
+; NoVLX-NEXT:    kxorw %k0, %k1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def %ax killed %ax killed %eax
 ; NoVLX-NEXT:    vzeroupper



