[llvm] r323605 - [X86] Use vptestm/vptestnm for comparisons with zero to avoid creating a zero vector.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 27 12:19:09 PST 2018


Author: ctopper
Date: Sat Jan 27 12:19:09 2018
New Revision: 323605

URL: http://llvm.org/viewvc/llvm-project?rev=323605&view=rev
Log:
[X86] Use vptestm/vptestnm for comparisons with zero to avoid creating a zero vector.

We can use the same input for both operands to get a free compare with zero.

We already use this trick in a couple of places where we explicitly create PTESTM with the same input twice. This change generalizes it.

I'm hoping to remove the ISD opcodes and move this to isel patterns like we do for scalar cmp/test.

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-arith.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll
    llvm/trunk/test/CodeGen/X86/avx512-mov.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll
    llvm/trunk/test/CodeGen/X86/avx512-vbroadcast.ll
    llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-arith.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-mov.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll
    llvm/trunk/test/CodeGen/X86/compress_expand.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/CodeGen/X86/masked_memop.ll
    llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll
    llvm/trunk/test/CodeGen/X86/pr35272.ll
    llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
    llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Jan 27 12:19:09 2018
@@ -17777,6 +17777,13 @@ static SDValue LowerIntVSETCC_AVX512(SDV
       return DAG.getNode(SSECC == 0 ? X86ISD::TESTNM : X86ISD::TESTM,
                          dl, VT, RHS, LHS);
     }
+
+    // If this is just a comparison with 0 without an AND, we can just use
+    // the same input twice to avoid creating a zero vector.
+    if (ISD::isBuildVectorAllZeros(Op1.getNode())) {
+      return DAG.getNode(SSECC == 0 ? X86ISD::TESTNM : X86ISD::TESTM,
+                         dl, VT, Op0, Op0);
+    }
   }
 
   unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU

Modified: llvm/trunk/test/CodeGen/X86/avx512-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-arith.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-arith.ll Sat Jan 27 12:19:09 2018
@@ -401,8 +401,7 @@ define <16 x i32> @vpaddd_broadcast_test
 define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd_mask_test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -414,8 +413,7 @@ define <16 x i32> @vpaddd_mask_test(<16
 define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd_maskz_test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -427,8 +425,7 @@ define <16 x i32> @vpaddd_maskz_test(<16
 define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd_mask_fold_test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -441,8 +438,7 @@ define <16 x i32> @vpaddd_mask_fold_test
 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd_mask_broadcast_test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -454,8 +450,7 @@ define <16 x i32> @vpaddd_mask_broadcast
 define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd_maskz_fold_test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -468,8 +463,7 @@ define <16 x i32> @vpaddd_maskz_fold_tes
 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd_maskz_broadcast_test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -671,8 +665,7 @@ entry:
 define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
 ; CHECK-LABEL: test_mask_vaddps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -686,8 +679,7 @@ define <16 x float> @test_mask_vaddps(<1
 define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
 ; CHECK-LABEL: test_mask_vmulps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -701,8 +693,7 @@ define <16 x float> @test_mask_vmulps(<1
 define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
 ; CHECK-LABEL: test_mask_vminps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -718,38 +709,33 @@ define <8 x double> @test_mask_vminpd(<8
 ; AVX512F-LABEL: test_mask_vminpd:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; AVX512F-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_mask_vminpd:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; AVX512VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
 ; AVX512VL-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_mask_vminpd:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; AVX512BW-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_mask_vminpd:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512DQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; AVX512DQ-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    retq
 ;
 ; SKX-LABEL: test_mask_vminpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1
 ; SKX-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
                                      <8 x double> %j, <8 x i32> %mask1)
@@ -764,8 +750,7 @@ define <8 x double> @test_mask_vminpd(<8
 define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
 ; CHECK-LABEL: test_mask_vmaxps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -781,38 +766,33 @@ define <8 x double> @test_mask_vmaxpd(<8
 ; AVX512F-LABEL: test_mask_vmaxpd:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; AVX512F-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_mask_vmaxpd:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; AVX512VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
 ; AVX512VL-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_mask_vmaxpd:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; AVX512BW-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_mask_vmaxpd:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    # kill: def %ymm3 killed %ymm3 def %zmm3
-; AVX512DQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; AVX512DQ-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    retq
 ;
 ; SKX-LABEL: test_mask_vmaxpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1
 ; SKX-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
                                      <8 x double> %j, <8 x i32> %mask1)
@@ -827,8 +807,7 @@ define <8 x double> @test_mask_vmaxpd(<8
 define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
 ; CHECK-LABEL: test_mask_vsubps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -842,8 +821,7 @@ define <16 x float> @test_mask_vsubps(<1
 define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
 ; CHECK-LABEL: test_mask_vdivps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -857,8 +835,7 @@ define <16 x float> @test_mask_vdivps(<1
 define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
 ; CHECK-LABEL: test_mask_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestmq %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <8 x double> %j, <8 x i64> %mask1)
@@ -872,8 +849,7 @@ define <8 x double> @test_mask_vaddpd(<8
 define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
 ; CHECK-LABEL: test_maskz_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
                                       <8 x i64> %mask1) nounwind readnone {
@@ -886,8 +862,7 @@ define <8 x double> @test_maskz_vaddpd(<
 define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
 ; CHECK-LABEL: test_mask_fold_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                      <8 x double>* %j,  <8 x i64> %mask1)
@@ -902,8 +877,7 @@ define <8 x double> @test_mask_fold_vadd
 define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
 ; CHECK-LABEL: test_maskz_fold_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
                                       <8 x i64> %mask1) nounwind {
@@ -930,8 +904,7 @@ define <8 x double> @test_broadcast_vadd
 define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
 ; CHECK-LABEL: test_mask_broadcast_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -949,8 +922,7 @@ define <8 x double> @test_mask_broadcast
 define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
 ; CHECK-LABEL: test_maskz_broadcast_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
                                        <8 x i64> %mask1) nounwind {

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Sat Jan 27 12:19:09 2018
@@ -2413,8 +2413,7 @@ define void @store_64i1(<64 x i1>* %a, <
 define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 ; KNL-LABEL: test_bitcast_v8i1_zext:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    addl %eax, %eax
@@ -2423,8 +2422,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 ;
 ; SKX-LABEL: test_bitcast_v8i1_zext:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; SKX-NEXT:    kmovb %k0, %eax
 ; SKX-NEXT:    addl %eax, %eax
 ; SKX-NEXT:    vzeroupper
@@ -2432,8 +2430,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 ;
 ; AVX512BW-LABEL: test_bitcast_v8i1_zext:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movzbl %al, %eax
 ; AVX512BW-NEXT:    addl %eax, %eax
@@ -2442,8 +2439,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 ;
 ; AVX512DQ-LABEL: test_bitcast_v8i1_zext:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovb %k0, %eax
 ; AVX512DQ-NEXT:    addl %eax, %eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2459,8 +2455,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
 ; CHECK-LABEL: test_bitcast_v16i1_zext:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addl %eax, %eax
 ; CHECK-NEXT:    vzeroupper

Modified: llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll Sat Jan 27 12:19:09 2018
@@ -5,8 +5,7 @@
 define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
 ; AVX512-LABEL: test1:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -17,8 +16,7 @@ define <16 x i32> @test1(<16 x i32> %tri
 define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
 ; AVX512-LABEL: test2:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -29,8 +27,7 @@ define <16 x i32> @test2(<16 x i32> %tri
 define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
 ; AVX512-LABEL: test3:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -42,8 +39,7 @@ define void @test3(<16 x i32> %trigger,
 define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
 ; AVX512-LABEL: test4:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
@@ -54,8 +50,7 @@ define <16 x float> @test4(<16 x i32> %t
 define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
 ; AVX512-LABEL: test13:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vmovups %zmm1, (%rdi) {%k1}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -99,9 +94,8 @@ declare <16 x i32*> @llvm.masked.load.v1
 define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
 ; AVX512-LABEL: test23:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512-NEXT:    vpcmpeqq %zmm2, %zmm1, %k2
+; AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
+; AVX512-NEXT:    vptestnmq %zmm1, %zmm1, %k2
 ; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
 ; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
 ; AVX512-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512-mov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mov.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mov.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mov.ll Sat Jan 27 12:19:09 2018
@@ -311,8 +311,7 @@ define <16 x float> @test31(i8 * %addr)
 define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; CHECK-LABEL: test32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -325,8 +324,7 @@ define <16 x i32> @test32(i8 * %addr, <1
 define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; CHECK-LABEL: test33:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -339,8 +337,7 @@ define <16 x i32> @test33(i8 * %addr, <1
 define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
 ; CHECK-LABEL: test34:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -353,8 +350,7 @@ define <16 x i32> @test34(i8 * %addr, <1
 define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
 ; CHECK-LABEL: test35:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -367,8 +363,7 @@ define <16 x i32> @test35(i8 * %addr, <1
 define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; CHECK-LABEL: test36:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -381,8 +376,7 @@ define <8 x i64> @test36(i8 * %addr, <8
 define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; CHECK-LABEL: test37:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -395,8 +389,7 @@ define <8 x i64> @test37(i8 * %addr, <8
 define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
 ; CHECK-LABEL: test38:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -409,8 +402,7 @@ define <8 x i64> @test38(i8 * %addr, <8
 define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
 ; CHECK-LABEL: test39:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Sat Jan 27 12:19:09 2018
@@ -400,15 +400,13 @@ define <16 x i32> @vpaddd_broadcast_test
 define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: vpaddd_mask_test:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpaddd_mask_test:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -420,15 +418,13 @@ define <16 x i32> @vpaddd_mask_test(<16
 define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: vpaddd_maskz_test:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpaddd_maskz_test:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -440,15 +436,13 @@ define <16 x i32> @vpaddd_maskz_test(<16
 define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: vpaddd_mask_fold_test:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpaddd_mask_fold_test:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -461,15 +455,13 @@ define <16 x i32> @vpaddd_mask_fold_test
 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: vpaddd_mask_broadcast_test:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpaddd_mask_broadcast_test:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -481,15 +473,13 @@ define <16 x i32> @vpaddd_mask_broadcast
 define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: vpaddd_maskz_fold_test:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpaddd_maskz_fold_test:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -502,15 +492,13 @@ define <16 x i32> @vpaddd_maskz_fold_tes
 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: vpaddd_maskz_broadcast_test:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpaddd_maskz_broadcast_test:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -717,15 +705,13 @@ entry:
 define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
 ; GENERIC-LABEL: test_mask_vaddps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vaddps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
                                      <16 x float> %j, <16 x i32> %mask1)
@@ -739,15 +725,13 @@ define <16 x float> @test_mask_vaddps(<1
 define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vmulps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vmulps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -759,15 +743,13 @@ define <16 x float> @test_mask_vmulps(<1
 define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vminps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vminps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -780,15 +762,13 @@ define <16 x float> @test_mask_vminps(<1
 define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vminpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vminpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -801,15 +781,13 @@ define <8 x double> @test_mask_vminpd(<8
 define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vmaxps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vmaxps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -822,15 +800,13 @@ define <16 x float> @test_mask_vmaxps(<1
 define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vmaxpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vmaxpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -843,15 +819,13 @@ define <8 x double> @test_mask_vmaxpd(<8
 define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vsubps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vsubps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -863,15 +837,13 @@ define <16 x float> @test_mask_vsubps(<1
 define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vdivps:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vdivps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -883,15 +855,13 @@ define <16 x float> @test_mask_vdivps(<1
 define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_mask_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -903,15 +873,13 @@ define <8 x double> @test_mask_vaddpd(<8
 define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone {
 ; GENERIC-LABEL: test_maskz_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_maskz_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -923,15 +891,13 @@ define <8 x double> @test_maskz_vaddpd(<
 define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double>* %j,  <8 x i64> %mask1) nounwind {
 ; GENERIC-LABEL: test_mask_fold_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_fold_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -944,15 +910,13 @@ define <8 x double> @test_mask_fold_vadd
 define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind {
 ; GENERIC-LABEL: test_maskz_fold_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_maskz_fold_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -983,16 +947,14 @@ define <8 x double> @test_broadcast_vadd
 define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind {
 ; GENERIC-LABEL: test_mask_broadcast_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_broadcast_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1009,15 +971,13 @@ define <8 x double> @test_mask_broadcast
 define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
 ; GENERIC-LABEL: test_maskz_broadcast_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_maskz_broadcast_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
                                        <8 x i64> %mask1) nounwind {
@@ -6383,15 +6343,13 @@ define <16 x float> @mov_test31(i8 * %ad
 define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6404,15 +6362,13 @@ define <16 x i32> @mov_test32(i8 * %addr
 define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test33:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test33:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6425,15 +6381,13 @@ define <16 x i32> @mov_test33(i8 * %addr
 define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test34:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test34:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6446,15 +6400,13 @@ define <16 x i32> @mov_test34(i8 * %addr
 define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test35:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test35:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -6467,15 +6419,13 @@ define <16 x i32> @mov_test35(i8 * %addr
 define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test36:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test36:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -6488,15 +6438,13 @@ define <8 x i64> @mov_test36(i8 * %addr,
 define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test37:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test37:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -6509,15 +6457,13 @@ define <8 x i64> @mov_test37(i8 * %addr,
 define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test38:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test38:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -6530,15 +6476,13 @@ define <8 x i64> @mov_test38(i8 * %addr,
 define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test39:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mov_test39:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -8032,8 +7976,7 @@ define void @store_64i1(<64 x i1>* %a, <
 define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 ; GENERIC-LABEL: test_bitcast_v8i1_zext:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    kmovb %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    addl %eax, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
@@ -8041,8 +7984,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 ;
 ; SKX-LABEL: test_bitcast_v8i1_zext:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    kmovb %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    addl %eax, %eax # sched: [1:0.25]
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
@@ -8058,8 +8000,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
 ; GENERIC-LABEL: test_bitcast_v16i1_zext:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    kmovw %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    addl %eax, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
@@ -8067,8 +8008,7 @@ define i32 @test_bitcast_v16i1_zext(<16
 ;
 ; SKX-LABEL: test_bitcast_v16i1_zext:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    kmovw %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    addl %eax, %eax # sched: [1:0.25]
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
@@ -8292,16 +8232,14 @@ define   <16 x float> @_inreg16xfloat(fl
 define   <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
 ; GENERIC-LABEL: _ss16xfloat_mask:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_mask:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8315,15 +8253,13 @@ define   <16 x float> @_ss16xfloat_mask(
 define   <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
 ; GENERIC-LABEL: _ss16xfloat_maskz:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_maskz:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -8352,15 +8288,13 @@ define   <16 x float> @_ss16xfloat_load(
 define   <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
 ; GENERIC-LABEL: _ss16xfloat_mask_load:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_mask_load:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = load float, float* %a.ptr
@@ -8374,15 +8308,13 @@ define   <16 x float> @_ss16xfloat_mask_
 define   <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: _ss16xfloat_maskz_load:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _ss16xfloat_maskz_load:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = load float, float* %a.ptr
@@ -8411,16 +8343,14 @@ define   <8 x double> @_inreg8xdouble(do
 define   <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
 ; GENERIC-LABEL: _sd8xdouble_mask:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_mask:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8434,15 +8364,13 @@ define   <8 x double> @_sd8xdouble_mask(
 define   <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
 ; GENERIC-LABEL: _sd8xdouble_maskz:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_maskz:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -8471,15 +8399,13 @@ define   <8 x double> @_sd8xdouble_load(
 define   <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
 ; GENERIC-LABEL: _sd8xdouble_mask_load:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_mask_load:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = load double, double* %a.ptr
@@ -8493,15 +8419,13 @@ define   <8 x double> @_sd8xdouble_mask_
 define   <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
 ; GENERIC-LABEL: _sd8xdouble_maskz_load:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: _sd8xdouble_maskz_load:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %a = load double, double* %a.ptr

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Sat Jan 27 12:19:09 2018
@@ -23,8 +23,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -32,8 +31,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-LABEL: test_masked_16xi16_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -47,16 +45,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -68,8 +64,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -77,8 +72,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-LABEL: test_masked_16xi16_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -92,16 +86,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -113,8 +105,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -122,8 +113,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-LABEL: test_masked_16xi16_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -137,16 +127,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -173,8 +161,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -182,8 +169,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; SKX-LABEL: test_masked_16xi16_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -197,16 +183,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -234,16 +218,14 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -257,16 +239,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -280,16 +260,14 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -303,16 +281,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -326,16 +302,14 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -349,16 +323,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -388,16 +360,14 @@ define <16 x i16> @test_masked_16xi16_pe
 ; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -411,16 +381,14 @@ define <16 x i16> @test_masked_z_16xi16_
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -449,8 +417,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -458,8 +425,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-LABEL: test_masked_32xi16_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -473,16 +439,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
@@ -494,8 +458,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -503,8 +466,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-LABEL: test_masked_32xi16_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -518,16 +480,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
@@ -539,8 +499,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -548,8 +507,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-LABEL: test_masked_32xi16_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -563,16 +521,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
@@ -599,8 +555,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -608,8 +563,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; SKX-LABEL: test_masked_32xi16_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -623,16 +577,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
@@ -660,16 +612,14 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -683,16 +633,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -706,16 +654,14 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -729,16 +675,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -752,16 +696,14 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -775,16 +717,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -814,16 +754,14 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -837,16 +775,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -875,8 +811,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -884,8 +819,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-LABEL: test_masked_8xi32_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -899,16 +833,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
@@ -920,8 +852,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -929,8 +860,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-LABEL: test_masked_8xi32_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -944,16 +874,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
@@ -965,8 +893,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -974,8 +901,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-LABEL: test_masked_8xi32_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -989,16 +915,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
@@ -1025,8 +949,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1034,8 +957,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; SKX-LABEL: test_masked_8xi32_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1049,16 +971,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
@@ -1086,16 +1006,14 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1109,16 +1027,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1132,16 +1048,14 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1155,16 +1069,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1178,16 +1090,14 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1201,16 +1111,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1240,16 +1148,14 @@ define <8 x i32> @test_masked_8xi32_perm
 ; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1263,16 +1169,14 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1301,8 +1205,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1310,8 +1213,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-LABEL: test_masked_16xi32_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1325,16 +1227,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
@@ -1346,8 +1246,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1355,8 +1254,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-LABEL: test_masked_16xi32_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1370,16 +1268,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
@@ -1391,8 +1287,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1400,8 +1295,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-LABEL: test_masked_16xi32_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1415,16 +1309,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
@@ -1451,8 +1343,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -1460,8 +1351,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; SKX-LABEL: test_masked_16xi32_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1475,16 +1365,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
@@ -1512,16 +1400,14 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1535,16 +1421,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1558,16 +1442,14 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1581,16 +1463,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1604,16 +1484,14 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1627,16 +1505,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1666,16 +1542,14 @@ define <16 x i32> @test_masked_16xi32_pe
 ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1689,16 +1563,14 @@ define <16 x i32> @test_masked_z_16xi32_
 ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1724,16 +1596,14 @@ define <4 x i64> @test_4xi64_perm_mask0(
 define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1746,15 +1616,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -1765,16 +1633,14 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1787,15 +1653,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -1806,16 +1670,14 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1828,15 +1690,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
@@ -1860,16 +1720,14 @@ define <4 x i64> @test_4xi64_perm_mask3(
 define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -1882,15 +1740,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
@@ -1915,15 +1771,13 @@ define <4 x i64> @test_4xi64_perm_mem_ma
 define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1936,15 +1790,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1957,15 +1809,13 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1978,15 +1828,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1999,15 +1847,13 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -2020,15 +1866,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -2055,15 +1899,13 @@ define <4 x i64> @test_4xi64_perm_mem_ma
 define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -2076,15 +1918,13 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -2113,8 +1953,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2122,8 +1961,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-LABEL: test_masked_8xi64_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2137,16 +1975,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
@@ -2157,16 +1993,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2179,15 +2013,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
@@ -2199,8 +2031,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2208,8 +2039,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-LABEL: test_masked_8xi64_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2223,16 +2053,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
@@ -2256,16 +2084,14 @@ define <8 x i64> @test_8xi64_perm_imm_ma
 define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2278,15 +2104,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
@@ -2298,8 +2122,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2307,8 +2130,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-LABEL: test_masked_8xi64_perm_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2322,16 +2144,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
@@ -2342,16 +2162,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2364,15 +2182,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -2399,8 +2215,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2408,8 +2223,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; SKX-LABEL: test_masked_8xi64_perm_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2423,16 +2237,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
@@ -2443,16 +2255,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2465,15 +2275,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
@@ -2501,16 +2309,14 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2524,16 +2330,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2546,15 +2350,13 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2567,15 +2369,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2589,16 +2389,14 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2612,16 +2410,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2648,15 +2444,13 @@ define <8 x i64> @test_8xi64_perm_imm_me
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2669,15 +2463,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2691,16 +2483,14 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2714,16 +2504,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2736,15 +2524,13 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2757,15 +2543,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2795,16 +2579,14 @@ define <8 x i64> @test_masked_8xi64_perm
 ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2818,16 +2600,14 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2840,15 +2620,13 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2861,15 +2639,13 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2898,8 +2674,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2907,8 +2682,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-LABEL: test_masked_8xfloat_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2922,16 +2696,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
@@ -2943,8 +2715,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2952,8 +2723,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-LABEL: test_masked_8xfloat_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -2967,16 +2737,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
@@ -2988,8 +2756,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -2997,8 +2764,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-LABEL: test_masked_8xfloat_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3012,16 +2778,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
@@ -3048,8 +2812,7 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3057,8 +2820,7 @@ define <8 x float> @test_masked_8xfloat_
 ; SKX-LABEL: test_masked_8xfloat_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3072,16 +2834,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
@@ -3109,16 +2869,14 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3132,16 +2890,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3155,16 +2911,14 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3178,16 +2932,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3201,16 +2953,14 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3224,16 +2974,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3263,16 +3011,14 @@ define <8 x float> @test_masked_8xfloat_
 ; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3286,16 +3032,14 @@ define <8 x float> @test_masked_z_8xfloa
 ; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
@@ -3324,8 +3068,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3333,8 +3076,7 @@ define <16 x float> @test_masked_16xfloa
 ; SKX-LABEL: test_masked_16xfloat_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3348,16 +3090,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
@@ -3369,8 +3109,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3378,8 +3117,7 @@ define <16 x float> @test_masked_16xfloa
 ; SKX-LABEL: test_masked_16xfloat_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3393,16 +3131,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
@@ -3414,8 +3150,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3423,8 +3158,7 @@ define <16 x float> @test_masked_16xfloa
 ; SKX-LABEL: test_masked_16xfloat_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3438,16 +3172,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
@@ -3474,8 +3206,7 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3483,8 +3214,7 @@ define <16 x float> @test_masked_16xfloa
 ; SKX-LABEL: test_masked_16xfloat_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3498,16 +3228,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
@@ -3535,16 +3263,14 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3558,16 +3284,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3581,16 +3305,14 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3604,16 +3326,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3627,16 +3347,14 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3650,16 +3368,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3689,16 +3405,14 @@ define <16 x float> @test_masked_16xfloa
 ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3712,16 +3426,14 @@ define <16 x float> @test_masked_z_16xfl
 ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
@@ -3747,16 +3459,14 @@ define <4 x double> @test_4xdouble_perm_
 define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3769,15 +3479,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
@@ -3788,16 +3496,14 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3810,15 +3516,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
@@ -3829,16 +3533,14 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3851,15 +3553,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
@@ -3883,16 +3583,14 @@ define <4 x double> @test_4xdouble_perm_
 define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -3905,15 +3603,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
@@ -3938,15 +3634,13 @@ define <4 x double> @test_4xdouble_perm_
 define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -3959,15 +3653,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -3980,15 +3672,13 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -4001,15 +3691,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -4022,15 +3710,13 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -4043,15 +3729,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -4078,15 +3762,13 @@ define <4 x double> @test_4xdouble_perm_
 define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -4099,15 +3781,13 @@ define <4 x double> @test_masked_4xdoubl
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
@@ -4136,8 +3816,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4145,8 +3824,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-LABEL: test_masked_8xdouble_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4160,16 +3838,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
@@ -4180,16 +3856,14 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4202,15 +3876,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
@@ -4222,8 +3894,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4231,8 +3902,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-LABEL: test_masked_8xdouble_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4246,16 +3916,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
@@ -4279,16 +3947,14 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4301,15 +3967,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
@@ -4321,8 +3985,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4330,8 +3993,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-LABEL: test_masked_8xdouble_perm_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4345,16 +4007,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
@@ -4365,16 +4025,14 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4387,15 +4045,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
@@ -4422,8 +4078,7 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -4431,8 +4086,7 @@ define <8 x double> @test_masked_8xdoubl
 ; SKX-LABEL: test_masked_8xdouble_perm_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4446,16 +4100,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
@@ -4466,16 +4118,14 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4488,15 +4138,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
@@ -4524,16 +4172,14 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4547,16 +4193,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4569,15 +4213,13 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4590,15 +4232,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4612,16 +4252,14 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4635,16 +4273,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4671,15 +4307,13 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4692,15 +4326,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4714,16 +4346,14 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4737,16 +4367,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4759,15 +4387,13 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4780,15 +4406,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4818,16 +4442,14 @@ define <8 x double> @test_masked_8xdoubl
 ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4841,16 +4463,14 @@ define <8 x double> @test_masked_z_8xdou
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4863,15 +4483,13 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4884,15 +4502,13 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
@@ -4918,16 +4534,14 @@ define <16 x i8> @test_16xi8_perm_mask0(
 define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4940,15 +4554,13 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -4959,16 +4571,14 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -4981,15 +4591,13 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -5000,16 +4608,14 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5022,15 +4628,13 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -5054,16 +4658,14 @@ define <16 x i8> @test_16xi8_perm_mask3(
 define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5076,15 +4678,13 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -5112,16 +4712,14 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5135,16 +4733,14 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5158,16 +4754,14 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5181,16 +4775,14 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5204,16 +4796,14 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5227,16 +4817,14 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5266,16 +4854,14 @@ define <16 x i8> @test_masked_16xi8_perm
 ; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5289,16 +4875,14 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -5324,16 +4908,14 @@ define <32 x i8> @test_32xi8_perm_mask0(
 define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5346,15 +4928,13 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
@@ -5365,16 +4945,14 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5387,15 +4965,13 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
@@ -5406,16 +4982,14 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5428,15 +5002,13 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
@@ -5460,16 +5032,14 @@ define <32 x i8> @test_32xi8_perm_mask3(
 define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5482,15 +5052,13 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
@@ -5518,16 +5086,14 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5541,16 +5107,14 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5564,16 +5128,14 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5587,16 +5149,14 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5610,16 +5170,14 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5633,16 +5191,14 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5672,16 +5228,14 @@ define <32 x i8> @test_masked_32xi8_perm
 ; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5695,16 +5249,14 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -5730,16 +5282,14 @@ define <64 x i8> @test_64xi8_perm_mask0(
 define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5752,15 +5302,13 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
@@ -5771,16 +5319,14 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5793,15 +5339,13 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
@@ -5812,16 +5356,14 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5834,15 +5376,13 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
@@ -5866,16 +5406,14 @@ define <64 x i8> @test_64xi8_perm_mask3(
 define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -5888,15 +5426,13 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
@@ -5924,16 +5460,14 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -5947,16 +5481,14 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -5970,16 +5502,14 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -5993,16 +5523,14 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -6016,16 +5544,14 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -6039,16 +5565,14 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -6078,16 +5602,14 @@ define <64 x i8> @test_masked_64xi8_perm
 ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -6101,16 +5623,14 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -6136,16 +5656,14 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6158,15 +5676,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
@@ -6177,16 +5693,14 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6199,15 +5713,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -6218,16 +5730,14 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6240,15 +5750,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
@@ -6272,16 +5780,14 @@ define <8 x i16> @test_8xi16_perm_low_ma
 define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6294,15 +5800,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -6313,16 +5817,14 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6335,15 +5837,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
@@ -6354,16 +5854,14 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6376,15 +5874,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -6408,16 +5904,14 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6430,15 +5924,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
@@ -6449,16 +5941,14 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6471,15 +5961,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -6504,15 +5992,13 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6525,15 +6011,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6546,15 +6030,13 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6567,15 +6049,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6588,15 +6068,13 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6609,15 +6087,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6644,15 +6120,13 @@ define <8 x i16> @test_8xi16_perm_low_me
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6665,15 +6139,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6686,15 +6158,13 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6707,15 +6177,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6728,15 +6196,13 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6749,15 +6215,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6784,15 +6248,13 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6805,15 +6267,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6826,15 +6286,13 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6847,15 +6305,13 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -6881,16 +6337,14 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6903,15 +6357,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
@@ -6922,16 +6374,14 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6944,15 +6394,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -6963,16 +6411,14 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -6985,15 +6431,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
@@ -7017,16 +6461,14 @@ define <16 x i16> @test_16xi16_perm_low_
 define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7039,15 +6481,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -7058,16 +6498,14 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7080,15 +6518,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
@@ -7099,16 +6535,14 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7121,15 +6555,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -7153,16 +6585,14 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7175,15 +6605,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
@@ -7194,16 +6622,14 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7216,15 +6642,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -7249,15 +6673,13 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7270,15 +6692,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7291,15 +6711,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7312,15 +6730,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7333,15 +6749,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7354,15 +6768,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7389,15 +6801,13 @@ define <16 x i16> @test_16xi16_perm_low_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7410,15 +6820,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7431,15 +6839,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7452,15 +6858,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7473,15 +6877,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7494,15 +6896,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7529,15 +6929,13 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7550,15 +6948,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7571,15 +6967,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7592,15 +6986,13 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -7626,16 +7018,14 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7648,15 +7038,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
@@ -7667,16 +7055,14 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7689,15 +7075,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7708,16 +7092,14 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7730,15 +7112,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
@@ -7762,16 +7142,14 @@ define <32 x i16> @test_32xi16_perm_low_
 define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7784,15 +7162,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -7803,16 +7179,14 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7825,15 +7199,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
@@ -7844,16 +7216,14 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7866,15 +7236,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7898,16 +7266,14 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7920,15 +7286,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
@@ -7939,16 +7303,14 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -7961,15 +7323,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7994,15 +7354,13 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8015,15 +7373,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8036,15 +7392,13 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8057,15 +7411,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8078,15 +7430,13 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8099,15 +7449,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8134,15 +7482,13 @@ define <32 x i16> @test_32xi16_perm_low_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8155,15 +7501,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8176,15 +7520,13 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8197,15 +7539,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8219,16 +7559,14 @@ define <32 x i16> @test_masked_32xi16_pe
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8242,16 +7580,14 @@ define <32 x i16> @test_masked_z_32xi16_
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8278,15 +7614,13 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8299,15 +7633,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8320,15 +7652,13 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8341,15 +7671,13 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -8375,16 +7703,14 @@ define <4 x i32> @test_4xi32_perm_mask0(
 define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8397,15 +7723,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
@@ -8416,16 +7740,14 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8438,15 +7760,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
@@ -8457,16 +7777,14 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8479,15 +7797,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
@@ -8511,16 +7827,14 @@ define <4 x i32> @test_4xi32_perm_mask3(
 define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8533,15 +7847,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
@@ -8566,15 +7878,13 @@ define <4 x i32> @test_4xi32_perm_mem_ma
 define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8587,15 +7897,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8608,15 +7916,13 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8629,15 +7935,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8650,15 +7954,13 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8671,15 +7973,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8706,15 +8006,13 @@ define <4 x i32> @test_4xi32_perm_mem_ma
 define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8727,15 +8025,13 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -8761,16 +8057,14 @@ define <8 x i32> @test2_8xi32_perm_mask0
 define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8783,15 +8077,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
@@ -8802,16 +8094,14 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8824,15 +8114,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
@@ -8843,16 +8131,14 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8865,15 +8151,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
@@ -8897,16 +8181,14 @@ define <8 x i32> @test2_8xi32_perm_mask3
 define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -8919,15 +8201,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
@@ -8952,15 +8232,13 @@ define <8 x i32> @test2_8xi32_perm_mem_m
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -8973,15 +8251,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -8994,15 +8270,13 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -9015,15 +8289,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -9036,15 +8308,13 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -9057,15 +8327,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -9092,15 +8360,13 @@ define <8 x i32> @test2_8xi32_perm_mem_m
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -9113,15 +8379,13 @@ define <8 x i32> @test2_masked_8xi32_per
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -9147,16 +8411,14 @@ define <16 x i32> @test2_16xi32_perm_mas
 define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9169,15 +8431,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
@@ -9188,16 +8448,14 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9210,15 +8468,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
@@ -9229,16 +8485,14 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9251,15 +8505,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
@@ -9283,16 +8535,14 @@ define <16 x i32> @test2_16xi32_perm_mas
 define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9305,15 +8555,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
@@ -9338,15 +8586,13 @@ define <16 x i32> @test2_16xi32_perm_mem
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9359,15 +8605,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9380,15 +8624,13 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9401,15 +8643,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9422,15 +8662,13 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9443,15 +8681,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9478,15 +8714,13 @@ define <16 x i32> @test2_16xi32_perm_mem
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9499,15 +8733,13 @@ define <16 x i32> @test2_masked_16xi32_p
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -9533,16 +8765,14 @@ define <8 x float> @test2_8xfloat_shuff_
 define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9555,15 +8785,13 @@ define <8 x float> @test2_8xfloat_masked
 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9574,16 +8802,14 @@ define <8 x float> @test2_8xfloat_zero_m
 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9596,15 +8822,13 @@ define <8 x float> @test2_8xfloat_masked
 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9615,16 +8839,14 @@ define <8 x float> @test2_8xfloat_zero_m
 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9637,15 +8859,13 @@ define <8 x float> @test2_8xfloat_masked
 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9669,16 +8889,14 @@ define <8 x float> @test2_8xfloat_shuff_
 define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9691,15 +8909,13 @@ define <8 x float> @test2_8xfloat_masked
 define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9724,16 +8940,14 @@ define <8 x float> @test_8xfloat_shuff_m
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9747,15 +8961,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9768,16 +8980,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9791,15 +9001,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9812,16 +9020,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9835,15 +9041,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9870,16 +9074,14 @@ define <8 x float> @test_8xfloat_shuff_m
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9893,15 +9095,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9927,16 +9127,14 @@ define <16 x float> @test_16xfloat_shuff
 define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9949,15 +9147,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -9968,16 +9164,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -9990,15 +9184,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
@@ -10009,16 +9201,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10031,15 +9221,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -10063,16 +9251,14 @@ define <16 x float> @test_16xfloat_shuff
 define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10085,15 +9271,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
@@ -10118,16 +9302,14 @@ define <16 x float> @test_16xfloat_shuff
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10141,15 +9323,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10162,16 +9342,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10185,15 +9363,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10206,16 +9382,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10229,15 +9403,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10264,16 +9436,14 @@ define <16 x float> @test_16xfloat_shuff
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10287,15 +9457,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10321,16 +9489,14 @@ define <4 x double> @test_4xdouble_shuff
 define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10343,15 +9509,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10362,16 +9526,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10384,15 +9546,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10403,16 +9563,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10425,15 +9583,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10457,16 +9613,14 @@ define <4 x double> @test_4xdouble_shuff
 define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10479,15 +9633,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10512,16 +9664,14 @@ define <4 x double> @test_4xdouble_shuff
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10535,15 +9685,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10556,16 +9704,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10579,15 +9725,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10600,16 +9744,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10623,15 +9765,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10658,16 +9798,14 @@ define <4 x double> @test_4xdouble_shuff
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10681,15 +9819,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10715,16 +9851,14 @@ define <8 x double> @test_8xdouble_shuff
 define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10737,15 +9871,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
@@ -10756,16 +9888,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10778,15 +9908,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
@@ -10797,16 +9925,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10819,15 +9945,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
@@ -10851,16 +9975,14 @@ define <8 x double> @test_8xdouble_shuff
 define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10873,15 +9995,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
@@ -10906,16 +10026,14 @@ define <8 x double> @test_8xdouble_shuff
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10929,15 +10047,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -10950,16 +10066,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -10973,15 +10087,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -10994,16 +10106,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11017,15 +10127,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -11052,16 +10160,14 @@ define <8 x double> @test_8xdouble_shuff
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11075,15 +10181,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -11109,16 +10213,14 @@ define <8 x i32> @test_8xi32_shuff_mask0
 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11131,15 +10233,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11150,16 +10250,14 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11172,15 +10270,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11191,16 +10287,14 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11213,15 +10307,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11245,16 +10337,14 @@ define <8 x i32> @test_8xi32_shuff_mask3
 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11267,15 +10357,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11300,16 +10388,14 @@ define <8 x i32> @test_8xi32_shuff_mem_m
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11323,15 +10409,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11344,16 +10428,14 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11367,15 +10449,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11388,16 +10468,14 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11411,15 +10489,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11446,16 +10522,14 @@ define <8 x i32> @test_8xi32_shuff_mem_m
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11469,15 +10543,13 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11503,16 +10575,14 @@ define <16 x i32> @test_16xi32_shuff_mas
 define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11525,15 +10595,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -11544,16 +10612,14 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11566,15 +10632,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -11585,16 +10649,14 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11607,15 +10669,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -11639,16 +10699,14 @@ define <16 x i32> @test_16xi32_shuff_mas
 define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11661,15 +10719,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -11694,16 +10750,14 @@ define <16 x i32> @test_16xi32_shuff_mem
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11717,15 +10771,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11738,16 +10790,14 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11761,15 +10811,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11782,16 +10830,14 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11805,15 +10851,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11840,16 +10884,14 @@ define <16 x i32> @test_16xi32_shuff_mem
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11863,15 +10905,13 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11897,16 +10937,14 @@ define <4 x i64> @test_4xi64_shuff_mask0
 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11919,15 +10957,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -11938,16 +10974,14 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -11960,15 +10994,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11979,16 +11011,14 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12001,15 +11031,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12033,16 +11061,14 @@ define <4 x i64> @test_4xi64_shuff_mask3
 define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12055,15 +11081,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12088,16 +11112,14 @@ define <4 x i64> @test_4xi64_shuff_mem_m
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12111,15 +11133,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12132,16 +11152,14 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12155,15 +11173,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12176,16 +11192,14 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12199,15 +11213,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12234,16 +11246,14 @@ define <4 x i64> @test_4xi64_shuff_mem_m
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12257,15 +11267,13 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12291,16 +11299,14 @@ define <8 x i64> @test_8xi64_shuff_mask0
 define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12313,15 +11319,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
@@ -12332,16 +11336,14 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12354,15 +11356,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
@@ -12373,16 +11373,14 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12395,15 +11393,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
@@ -12427,16 +11423,14 @@ define <8 x i64> @test_8xi64_shuff_mask3
 define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12449,15 +11443,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
@@ -12482,16 +11474,14 @@ define <8 x i64> @test_8xi64_shuff_mem_m
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12505,15 +11495,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12526,16 +11514,14 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12549,15 +11535,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12570,16 +11554,14 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12593,15 +11575,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12628,16 +11608,14 @@ define <8 x i64> @test_8xi64_shuff_mem_m
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
 ; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12651,15 +11629,13 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12685,16 +11661,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12707,15 +11681,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12726,16 +11698,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12748,15 +11718,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12767,16 +11735,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12789,15 +11755,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12821,16 +11785,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12843,15 +11805,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12876,16 +11836,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12899,15 +11857,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -12920,16 +11876,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12943,15 +11897,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -12964,16 +11916,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -12987,15 +11937,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -13022,16 +11970,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13045,15 +11991,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -13079,16 +12023,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13101,15 +12043,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13120,16 +12060,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13142,15 +12080,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13161,16 +12097,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13183,15 +12117,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13215,16 +12147,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13237,15 +12167,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13270,16 +12198,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13293,15 +12219,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13314,16 +12238,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13337,15 +12259,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13358,16 +12278,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13381,15 +12299,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13416,16 +12332,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13439,15 +12353,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13473,16 +12385,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13495,15 +12405,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13514,16 +12422,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13536,15 +12442,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13555,16 +12459,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13577,15 +12479,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13609,16 +12509,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13631,15 +12529,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13664,16 +12560,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13687,15 +12581,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13708,16 +12600,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13731,15 +12621,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13752,16 +12640,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13775,15 +12661,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13810,16 +12694,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13833,15 +12715,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13867,16 +12747,14 @@ define <2 x double> @test_2xdouble_unpac
 define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13889,15 +12767,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -13908,16 +12784,14 @@ define <2 x double> @test_2xdouble_zero_
 define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13930,15 +12804,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -13963,16 +12835,14 @@ define <2 x double> @test_2xdouble_unpac
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -13986,15 +12856,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -14007,16 +12875,14 @@ define <2 x double> @test_2xdouble_zero_
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14030,15 +12896,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -14064,16 +12928,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14086,15 +12948,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14105,16 +12965,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14127,15 +12985,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14146,16 +13002,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14168,15 +13022,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14200,16 +13052,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14222,15 +13072,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14255,16 +13103,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14278,15 +13124,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14299,16 +13143,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14322,15 +13164,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14343,16 +13183,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14366,15 +13204,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14401,16 +13237,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14424,15 +13258,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14458,16 +13290,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14480,15 +13310,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14499,16 +13327,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14521,15 +13347,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14540,16 +13364,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14562,15 +13384,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14594,16 +13414,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14616,15 +13434,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14649,16 +13465,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14672,15 +13486,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14693,16 +13505,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14716,15 +13526,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14737,16 +13545,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14760,15 +13566,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14795,16 +13599,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14818,15 +13620,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14852,16 +13652,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14874,15 +13672,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14893,16 +13689,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14915,15 +13709,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14934,16 +13726,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -14956,15 +13746,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14988,16 +13776,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15010,15 +13796,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15043,16 +13827,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15066,15 +13848,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15087,16 +13867,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15110,15 +13888,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15131,16 +13907,14 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15154,15 +13928,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15189,16 +13961,14 @@ define <4 x float> @test_4xfloat_unpack_
 define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15212,15 +13982,13 @@ define <4 x float> @test_4xfloat_masked_
 define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15246,16 +14014,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15268,15 +14034,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15287,16 +14051,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15309,15 +14071,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15328,16 +14088,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15350,15 +14108,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15382,16 +14138,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15404,15 +14158,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15437,16 +14189,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15460,15 +14210,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15481,16 +14229,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15504,15 +14250,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15525,16 +14269,14 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15548,15 +14290,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15583,16 +14323,14 @@ define <8 x float> @test_8xfloat_unpack_
 define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15606,15 +14344,13 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15640,16 +14376,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15662,15 +14396,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15681,16 +14413,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15703,15 +14433,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15722,16 +14450,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15744,15 +14470,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15776,16 +14500,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15798,15 +14520,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15831,16 +14551,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15854,15 +14572,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15875,16 +14591,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15898,15 +14612,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15919,16 +14631,14 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -15942,15 +14652,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15977,16 +14685,14 @@ define <16 x float> @test_16xfloat_unpac
 define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16000,15 +14706,13 @@ define <16 x float> @test_16xfloat_maske
 define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -16034,16 +14738,14 @@ define <2 x double> @test_2xdouble_unpac
 define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16056,15 +14758,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16075,16 +14775,14 @@ define <2 x double> @test_2xdouble_zero_
 define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16097,15 +14795,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16130,16 +14826,14 @@ define <2 x double> @test_2xdouble_unpac
 define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16153,15 +14847,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -16174,16 +14866,14 @@ define <2 x double> @test_2xdouble_zero_
 define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16197,15 +14887,13 @@ define <2 x double> @test_2xdouble_maske
 define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -16231,16 +14919,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16253,15 +14939,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16272,16 +14956,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16294,15 +14976,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16313,16 +14993,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16335,15 +15013,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16367,16 +15043,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16389,15 +15063,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16422,16 +15094,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16445,15 +15115,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16466,16 +15134,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16489,15 +15155,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16510,16 +15174,14 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16533,15 +15195,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16568,16 +15228,14 @@ define <4 x double> @test_4xdouble_unpac
 define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16591,15 +15249,13 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16625,16 +15281,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16647,15 +15301,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16666,16 +15318,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16688,15 +15338,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16707,16 +15355,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16729,15 +15375,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16761,16 +15405,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16783,15 +15425,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16816,16 +15456,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16839,15 +15477,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16860,16 +15496,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16883,15 +15517,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16904,16 +15536,14 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16927,15 +15557,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16962,16 +15590,14 @@ define <8 x double> @test_8xdouble_unpac
 define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -16985,15 +15611,13 @@ define <8 x double> @test_8xdouble_maske
 define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:1.00]
 ; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll Sat Jan 27 12:19:09 2018
@@ -13,8 +13,7 @@ define <16 x i8> @test_i8_to_16(i8 %s) {
 define <16 x i8> @test_masked_i8_to_16_mask0(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -27,8 +26,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mask0(i8 %s, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -40,8 +38,7 @@ define <16 x i8> @test_masked_z_i8_to_16
 define <16 x i8> @test_masked_i8_to_16_mask1(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -54,8 +51,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mask1(i8 %s, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -67,8 +63,7 @@ define <16 x i8> @test_masked_z_i8_to_16
 define <16 x i8> @test_masked_i8_to_16_mask2(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -81,8 +76,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mask2(i8 %s, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -94,8 +88,7 @@ define <16 x i8> @test_masked_z_i8_to_16
 define <16 x i8> @test_masked_i8_to_16_mask3(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -108,8 +101,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mask3(i8 %s, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -130,8 +122,7 @@ define <32 x i8> @test_i8_to_32(i8 %s) {
 define <32 x i8> @test_masked_i8_to_32_mask0(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -144,8 +135,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mask0(i8 %s, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -157,8 +147,7 @@ define <32 x i8> @test_masked_z_i8_to_32
 define <32 x i8> @test_masked_i8_to_32_mask1(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -171,8 +160,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mask1(i8 %s, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -184,8 +172,7 @@ define <32 x i8> @test_masked_z_i8_to_32
 define <32 x i8> @test_masked_i8_to_32_mask2(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -198,8 +185,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mask2(i8 %s, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -211,8 +197,7 @@ define <32 x i8> @test_masked_z_i8_to_32
 define <32 x i8> @test_masked_i8_to_32_mask3(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -225,8 +210,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mask3(i8 %s, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -247,8 +231,7 @@ define <64 x i8> @test_i8_to_64(i8 %s) {
 define <64 x i8> @test_masked_i8_to_64_mask0(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -261,8 +244,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mask0(i8 %s, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -274,8 +256,7 @@ define <64 x i8> @test_masked_z_i8_to_64
 define <64 x i8> @test_masked_i8_to_64_mask1(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -288,8 +269,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mask1(i8 %s, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -301,8 +281,7 @@ define <64 x i8> @test_masked_z_i8_to_64
 define <64 x i8> @test_masked_i8_to_64_mask2(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -315,8 +294,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mask2(i8 %s, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -328,8 +306,7 @@ define <64 x i8> @test_masked_z_i8_to_64
 define <64 x i8> @test_masked_i8_to_64_mask3(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -342,8 +319,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mask3(i8 %s, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i8> undef, i8 %s, i32 0
@@ -364,8 +340,7 @@ define <8 x i16> @test_i16_to_8(i16 %s)
 define <8 x i16> @test_masked_i16_to_8_mask0(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -378,8 +353,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mask0(i16 %s, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -391,8 +365,7 @@ define <8 x i16> @test_masked_z_i16_to_8
 define <8 x i16> @test_masked_i16_to_8_mask1(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -405,8 +378,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mask1(i16 %s, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -418,8 +390,7 @@ define <8 x i16> @test_masked_z_i16_to_8
 define <8 x i16> @test_masked_i16_to_8_mask2(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -432,8 +403,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mask2(i16 %s, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -445,8 +415,7 @@ define <8 x i16> @test_masked_z_i16_to_8
 define <8 x i16> @test_masked_i16_to_8_mask3(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -459,8 +428,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mask3(i16 %s, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -481,8 +449,7 @@ define <16 x i16> @test_i16_to_16(i16 %s
 define <16 x i16> @test_masked_i16_to_16_mask0(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -495,8 +462,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mask0(i16 %s, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -508,8 +474,7 @@ define <16 x i16> @test_masked_z_i16_to_
 define <16 x i16> @test_masked_i16_to_16_mask1(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -522,8 +487,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mask1(i16 %s, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -535,8 +499,7 @@ define <16 x i16> @test_masked_z_i16_to_
 define <16 x i16> @test_masked_i16_to_16_mask2(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -549,8 +512,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mask2(i16 %s, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -562,8 +524,7 @@ define <16 x i16> @test_masked_z_i16_to_
 define <16 x i16> @test_masked_i16_to_16_mask3(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -576,8 +537,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mask3(i16 %s, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -598,8 +558,7 @@ define <32 x i16> @test_i16_to_32(i16 %s
 define <32 x i16> @test_masked_i16_to_32_mask0(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -612,8 +571,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mask0(i16 %s, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -625,8 +583,7 @@ define <32 x i16> @test_masked_z_i16_to_
 define <32 x i16> @test_masked_i16_to_32_mask1(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -639,8 +596,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mask1(i16 %s, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -652,8 +608,7 @@ define <32 x i16> @test_masked_z_i16_to_
 define <32 x i16> @test_masked_i16_to_32_mask2(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -666,8 +621,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mask2(i16 %s, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -679,8 +633,7 @@ define <32 x i16> @test_masked_z_i16_to_
 define <32 x i16> @test_masked_i16_to_32_mask3(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -693,8 +646,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mask3(i16 %s, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i16> undef, i16 %s, i32 0
@@ -715,8 +667,7 @@ define <4 x i32> @test_i32_to_4(i32 %s)
 define <4 x i32> @test_masked_i32_to_4_mask0(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -729,8 +680,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mask0(i32 %s, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -742,8 +692,7 @@ define <4 x i32> @test_masked_z_i32_to_4
 define <4 x i32> @test_masked_i32_to_4_mask1(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -756,8 +705,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mask1(i32 %s, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -769,8 +717,7 @@ define <4 x i32> @test_masked_z_i32_to_4
 define <4 x i32> @test_masked_i32_to_4_mask2(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -783,8 +730,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mask2(i32 %s, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -796,8 +742,7 @@ define <4 x i32> @test_masked_z_i32_to_4
 define <4 x i32> @test_masked_i32_to_4_mask3(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -810,8 +755,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mask3(i32 %s, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -832,8 +776,7 @@ define <8 x i32> @test_i32_to_8(i32 %s)
 define <8 x i32> @test_masked_i32_to_8_mask0(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -846,8 +789,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mask0(i32 %s, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -859,8 +801,7 @@ define <8 x i32> @test_masked_z_i32_to_8
 define <8 x i32> @test_masked_i32_to_8_mask1(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -873,8 +814,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mask1(i32 %s, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -886,8 +826,7 @@ define <8 x i32> @test_masked_z_i32_to_8
 define <8 x i32> @test_masked_i32_to_8_mask2(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -900,8 +839,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mask2(i32 %s, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -913,8 +851,7 @@ define <8 x i32> @test_masked_z_i32_to_8
 define <8 x i32> @test_masked_i32_to_8_mask3(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -927,8 +864,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mask3(i32 %s, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -949,8 +885,7 @@ define <16 x i32> @test_i32_to_16(i32 %s
 define <16 x i32> @test_masked_i32_to_16_mask0(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -963,8 +898,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mask0(i32 %s, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -976,8 +910,7 @@ define <16 x i32> @test_masked_z_i32_to_
 define <16 x i32> @test_masked_i32_to_16_mask1(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -990,8 +923,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mask1(i32 %s, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1003,8 +935,7 @@ define <16 x i32> @test_masked_z_i32_to_
 define <16 x i32> @test_masked_i32_to_16_mask2(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1017,8 +948,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mask2(i32 %s, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1030,8 +960,7 @@ define <16 x i32> @test_masked_z_i32_to_
 define <16 x i32> @test_masked_i32_to_16_mask3(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1044,8 +973,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mask3(i32 %s, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i32> undef, i32 %s, i32 0
@@ -1066,8 +994,7 @@ define <2 x i64> @test_i64_to_2(i64 %s)
 define <2 x i64> @test_masked_i64_to_2_mask0(i64 %s, <2 x i64> %default, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_2_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1080,8 +1007,7 @@ define <2 x i64> @test_masked_i64_to_2_m
 define <2 x i64> @test_masked_z_i64_to_2_mask0(i64 %s, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_2_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1093,8 +1019,7 @@ define <2 x i64> @test_masked_z_i64_to_2
 define <2 x i64> @test_masked_i64_to_2_mask1(i64 %s, <2 x i64> %default, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_2_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1107,8 +1032,7 @@ define <2 x i64> @test_masked_i64_to_2_m
 define <2 x i64> @test_masked_z_i64_to_2_mask1(i64 %s, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_2_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1129,8 +1053,7 @@ define <4 x i64> @test_i64_to_4(i64 %s)
 define <4 x i64> @test_masked_i64_to_4_mask0(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1143,8 +1066,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mask0(i64 %s, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1156,8 +1078,7 @@ define <4 x i64> @test_masked_z_i64_to_4
 define <4 x i64> @test_masked_i64_to_4_mask1(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1170,8 +1091,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mask1(i64 %s, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1183,8 +1103,7 @@ define <4 x i64> @test_masked_z_i64_to_4
 define <4 x i64> @test_masked_i64_to_4_mask2(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1197,8 +1116,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mask2(i64 %s, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1210,8 +1128,7 @@ define <4 x i64> @test_masked_z_i64_to_4
 define <4 x i64> @test_masked_i64_to_4_mask3(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1224,8 +1141,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mask3(i64 %s, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1246,8 +1162,7 @@ define <8 x i64> @test_i64_to_8(i64 %s)
 define <8 x i64> @test_masked_i64_to_8_mask0(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1260,8 +1175,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mask0(i64 %s, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1273,8 +1187,7 @@ define <8 x i64> @test_masked_z_i64_to_8
 define <8 x i64> @test_masked_i64_to_8_mask1(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1287,8 +1200,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mask1(i64 %s, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1300,8 +1212,7 @@ define <8 x i64> @test_masked_z_i64_to_8
 define <8 x i64> @test_masked_i64_to_8_mask2(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1314,8 +1225,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mask2(i64 %s, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1327,8 +1237,7 @@ define <8 x i64> @test_masked_z_i64_to_8
 define <8 x i64> @test_masked_i64_to_8_mask3(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1341,8 +1250,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mask3(i64 %s, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x i64> undef, i64 %s, i32 0
@@ -1364,8 +1272,7 @@ define <16 x i8> @test_i8_to_16_mem(i8*
 define <16 x i8> @test_masked_i8_to_16_mem_mask0(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1379,8 +1286,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mem_mask0(i8* %p, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1393,8 +1299,7 @@ define <16 x i8> @test_masked_z_i8_to_16
 define <16 x i8> @test_masked_i8_to_16_mem_mask1(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1408,8 +1313,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mem_mask1(i8* %p, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1422,8 +1326,7 @@ define <16 x i8> @test_masked_z_i8_to_16
 define <16 x i8> @test_masked_i8_to_16_mem_mask2(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1437,8 +1340,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mem_mask2(i8* %p, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1451,8 +1353,7 @@ define <16 x i8> @test_masked_z_i8_to_16
 define <16 x i8> @test_masked_i8_to_16_mem_mask3(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_16_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1466,8 +1367,7 @@ define <16 x i8> @test_masked_i8_to_16_m
 define <16 x i8> @test_masked_z_i8_to_16_mem_mask3(i8* %p, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1490,8 +1390,7 @@ define <32 x i8> @test_i8_to_32_mem(i8*
 define <32 x i8> @test_masked_i8_to_32_mem_mask0(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1505,8 +1404,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mem_mask0(i8* %p, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1519,8 +1417,7 @@ define <32 x i8> @test_masked_z_i8_to_32
 define <32 x i8> @test_masked_i8_to_32_mem_mask1(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1534,8 +1431,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mem_mask1(i8* %p, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1548,8 +1444,7 @@ define <32 x i8> @test_masked_z_i8_to_32
 define <32 x i8> @test_masked_i8_to_32_mem_mask2(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1563,8 +1458,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mem_mask2(i8* %p, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1577,8 +1471,7 @@ define <32 x i8> @test_masked_z_i8_to_32
 define <32 x i8> @test_masked_i8_to_32_mem_mask3(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1592,8 +1485,7 @@ define <32 x i8> @test_masked_i8_to_32_m
 define <32 x i8> @test_masked_z_i8_to_32_mem_mask3(i8* %p, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1616,8 +1508,7 @@ define <64 x i8> @test_i8_to_64_mem(i8*
 define <64 x i8> @test_masked_i8_to_64_mem_mask0(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1631,8 +1522,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mem_mask0(i8* %p, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1645,8 +1535,7 @@ define <64 x i8> @test_masked_z_i8_to_64
 define <64 x i8> @test_masked_i8_to_64_mem_mask1(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1660,8 +1549,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mem_mask1(i8* %p, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1674,8 +1562,7 @@ define <64 x i8> @test_masked_z_i8_to_64
 define <64 x i8> @test_masked_i8_to_64_mem_mask2(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1689,8 +1576,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mem_mask2(i8* %p, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1703,8 +1589,7 @@ define <64 x i8> @test_masked_z_i8_to_64
 define <64 x i8> @test_masked_i8_to_64_mem_mask3(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_i8_to_64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1718,8 +1603,7 @@ define <64 x i8> @test_masked_i8_to_64_m
 define <64 x i8> @test_masked_z_i8_to_64_mem_mask3(i8* %p, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastb (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i8, i8* %p
@@ -1742,8 +1626,7 @@ define <8 x i16> @test_i16_to_8_mem(i16*
 define <8 x i16> @test_masked_i16_to_8_mem_mask0(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1757,8 +1640,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mem_mask0(i16* %p, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1771,8 +1653,7 @@ define <8 x i16> @test_masked_z_i16_to_8
 define <8 x i16> @test_masked_i16_to_8_mem_mask1(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1786,8 +1667,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mem_mask1(i16* %p, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1800,8 +1680,7 @@ define <8 x i16> @test_masked_z_i16_to_8
 define <8 x i16> @test_masked_i16_to_8_mem_mask2(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1815,8 +1694,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mem_mask2(i16* %p, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1829,8 +1707,7 @@ define <8 x i16> @test_masked_z_i16_to_8
 define <8 x i16> @test_masked_i16_to_8_mem_mask3(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_8_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1844,8 +1721,7 @@ define <8 x i16> @test_masked_i16_to_8_m
 define <8 x i16> @test_masked_z_i16_to_8_mem_mask3(i16* %p, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1868,8 +1744,7 @@ define <16 x i16> @test_i16_to_16_mem(i1
 define <16 x i16> @test_masked_i16_to_16_mem_mask0(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1883,8 +1758,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mem_mask0(i16* %p, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1897,8 +1771,7 @@ define <16 x i16> @test_masked_z_i16_to_
 define <16 x i16> @test_masked_i16_to_16_mem_mask1(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1912,8 +1785,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mem_mask1(i16* %p, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1926,8 +1798,7 @@ define <16 x i16> @test_masked_z_i16_to_
 define <16 x i16> @test_masked_i16_to_16_mem_mask2(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1941,8 +1812,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mem_mask2(i16* %p, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1955,8 +1825,7 @@ define <16 x i16> @test_masked_z_i16_to_
 define <16 x i16> @test_masked_i16_to_16_mem_mask3(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_16_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1970,8 +1839,7 @@ define <16 x i16> @test_masked_i16_to_16
 define <16 x i16> @test_masked_z_i16_to_16_mem_mask3(i16* %p, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -1994,8 +1862,7 @@ define <32 x i16> @test_i16_to_32_mem(i1
 define <32 x i16> @test_masked_i16_to_32_mem_mask0(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2009,8 +1876,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mem_mask0(i16* %p, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2023,8 +1889,7 @@ define <32 x i16> @test_masked_z_i16_to_
 define <32 x i16> @test_masked_i16_to_32_mem_mask1(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2038,8 +1903,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mem_mask1(i16* %p, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2052,8 +1916,7 @@ define <32 x i16> @test_masked_z_i16_to_
 define <32 x i16> @test_masked_i16_to_32_mem_mask2(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2067,8 +1930,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mem_mask2(i16* %p, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2081,8 +1943,7 @@ define <32 x i16> @test_masked_z_i16_to_
 define <32 x i16> @test_masked_i16_to_32_mem_mask3(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_i16_to_32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2096,8 +1957,7 @@ define <32 x i16> @test_masked_i16_to_32
 define <32 x i16> @test_masked_z_i16_to_32_mem_mask3(i16* %p, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastw (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i16, i16* %p
@@ -2120,8 +1980,7 @@ define <4 x i32> @test_i32_to_4_mem(i32*
 define <4 x i32> @test_masked_i32_to_4_mem_mask0(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2135,8 +1994,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mem_mask0(i32* %p, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2149,8 +2007,7 @@ define <4 x i32> @test_masked_z_i32_to_4
 define <4 x i32> @test_masked_i32_to_4_mem_mask1(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2164,8 +2021,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mem_mask1(i32* %p, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2178,8 +2034,7 @@ define <4 x i32> @test_masked_z_i32_to_4
 define <4 x i32> @test_masked_i32_to_4_mem_mask2(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2193,8 +2048,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mem_mask2(i32* %p, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2207,8 +2061,7 @@ define <4 x i32> @test_masked_z_i32_to_4
 define <4 x i32> @test_masked_i32_to_4_mem_mask3(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_4_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2222,8 +2075,7 @@ define <4 x i32> @test_masked_i32_to_4_m
 define <4 x i32> @test_masked_z_i32_to_4_mem_mask3(i32* %p, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2246,8 +2098,7 @@ define <8 x i32> @test_i32_to_8_mem(i32*
 define <8 x i32> @test_masked_i32_to_8_mem_mask0(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2261,8 +2112,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mem_mask0(i32* %p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2275,8 +2125,7 @@ define <8 x i32> @test_masked_z_i32_to_8
 define <8 x i32> @test_masked_i32_to_8_mem_mask1(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2290,8 +2139,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mem_mask1(i32* %p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2304,8 +2152,7 @@ define <8 x i32> @test_masked_z_i32_to_8
 define <8 x i32> @test_masked_i32_to_8_mem_mask2(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2319,8 +2166,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mem_mask2(i32* %p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2333,8 +2179,7 @@ define <8 x i32> @test_masked_z_i32_to_8
 define <8 x i32> @test_masked_i32_to_8_mem_mask3(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_8_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2348,8 +2193,7 @@ define <8 x i32> @test_masked_i32_to_8_m
 define <8 x i32> @test_masked_z_i32_to_8_mem_mask3(i32* %p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2372,8 +2216,7 @@ define <16 x i32> @test_i32_to_16_mem(i3
 define <16 x i32> @test_masked_i32_to_16_mem_mask0(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2387,8 +2230,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mem_mask0(i32* %p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2401,8 +2243,7 @@ define <16 x i32> @test_masked_z_i32_to_
 define <16 x i32> @test_masked_i32_to_16_mem_mask1(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2416,8 +2257,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mem_mask1(i32* %p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2430,8 +2270,7 @@ define <16 x i32> @test_masked_z_i32_to_
 define <16 x i32> @test_masked_i32_to_16_mem_mask2(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2445,8 +2284,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mem_mask2(i32* %p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2459,8 +2297,7 @@ define <16 x i32> @test_masked_z_i32_to_
 define <16 x i32> @test_masked_i32_to_16_mem_mask3(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_i32_to_16_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2474,8 +2311,7 @@ define <16 x i32> @test_masked_i32_to_16
 define <16 x i32> @test_masked_z_i32_to_16_mem_mask3(i32* %p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i32, i32* %p
@@ -2498,8 +2334,7 @@ define <2 x i64> @test_i64_to_2_mem(i64*
 define <2 x i64> @test_masked_i64_to_2_mem_mask0(i64* %p, <2 x i64> %default, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_2_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2513,8 +2348,7 @@ define <2 x i64> @test_masked_i64_to_2_m
 define <2 x i64> @test_masked_z_i64_to_2_mem_mask0(i64* %p, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2527,8 +2361,7 @@ define <2 x i64> @test_masked_z_i64_to_2
 define <2 x i64> @test_masked_i64_to_2_mem_mask1(i64* %p, <2 x i64> %default, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_2_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2542,8 +2375,7 @@ define <2 x i64> @test_masked_i64_to_2_m
 define <2 x i64> @test_masked_z_i64_to_2_mem_mask1(i64* %p, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2566,8 +2398,7 @@ define <4 x i64> @test_i64_to_4_mem(i64*
 define <4 x i64> @test_masked_i64_to_4_mem_mask0(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2581,8 +2412,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mem_mask0(i64* %p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2595,8 +2425,7 @@ define <4 x i64> @test_masked_z_i64_to_4
 define <4 x i64> @test_masked_i64_to_4_mem_mask1(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2610,8 +2439,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mem_mask1(i64* %p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2624,8 +2452,7 @@ define <4 x i64> @test_masked_z_i64_to_4
 define <4 x i64> @test_masked_i64_to_4_mem_mask2(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2639,8 +2466,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mem_mask2(i64* %p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2653,8 +2479,7 @@ define <4 x i64> @test_masked_z_i64_to_4
 define <4 x i64> @test_masked_i64_to_4_mem_mask3(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_4_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2668,8 +2493,7 @@ define <4 x i64> @test_masked_i64_to_4_m
 define <4 x i64> @test_masked_z_i64_to_4_mem_mask3(i64* %p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2692,8 +2516,7 @@ define <8 x i64> @test_i64_to_8_mem(i64*
 define <8 x i64> @test_masked_i64_to_8_mem_mask0(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2707,8 +2530,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mem_mask0(i64* %p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2721,8 +2543,7 @@ define <8 x i64> @test_masked_z_i64_to_8
 define <8 x i64> @test_masked_i64_to_8_mem_mask1(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2736,8 +2557,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mem_mask1(i64* %p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2750,8 +2570,7 @@ define <8 x i64> @test_masked_z_i64_to_8
 define <8 x i64> @test_masked_i64_to_8_mem_mask2(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2765,8 +2584,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mem_mask2(i64* %p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2779,8 +2597,7 @@ define <8 x i64> @test_masked_z_i64_to_8
 define <8 x i64> @test_masked_i64_to_8_mem_mask3(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_i64_to_8_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p
@@ -2794,8 +2611,7 @@ define <8 x i64> @test_masked_i64_to_8_m
 define <8 x i64> @test_masked_z_i64_to_8_mem_mask3(i64* %p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpbroadcastq (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load i64, i64* %p

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll Sat Jan 27 12:19:09 2018
@@ -14,8 +14,7 @@ define <4 x i32> @test_2xi32_to_4xi32(<4
 define <4 x i32> @test_masked_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -28,8 +27,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -40,8 +38,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 define <4 x i32> @test_masked_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -54,8 +51,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -66,8 +62,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 define <4 x i32> @test_masked_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -80,8 +75,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -92,8 +86,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 define <4 x i32> @test_masked_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -106,8 +99,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -126,8 +118,7 @@ define <8 x i32> @test_2xi32_to_8xi32(<8
 define <8 x i32> @test_masked_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -140,8 +131,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -152,8 +142,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 define <8 x i32> @test_masked_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -166,8 +155,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -178,8 +166,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 define <8 x i32> @test_masked_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -192,8 +179,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -204,8 +190,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 define <8 x i32> @test_masked_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -218,8 +203,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -238,8 +222,7 @@ define <16 x i32> @test_2xi32_to_16xi32(
 define <16 x i32> @test_masked_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -252,8 +235,7 @@ define <16 x i32> @test_masked_2xi32_to_
 define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -264,8 +246,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 define <16 x i32> @test_masked_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -278,8 +259,7 @@ define <16 x i32> @test_masked_2xi32_to_
 define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -290,8 +270,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 define <16 x i32> @test_masked_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -304,8 +283,7 @@ define <16 x i32> @test_masked_2xi32_to_
 define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -316,8 +294,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 define <16 x i32> @test_masked_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -330,8 +307,7 @@ define <16 x i32> @test_masked_2xi32_to_
 define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -351,8 +327,7 @@ define <4 x i32> @test_2xi32_to_4xi32_me
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -365,8 +340,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -378,8 +352,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -392,8 +365,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -405,8 +377,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -419,8 +390,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -432,8 +402,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -446,8 +415,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -468,8 +436,7 @@ define <8 x i32> @test_2xi32_to_8xi32_me
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -482,8 +449,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -495,8 +461,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -509,8 +474,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -522,8 +486,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -536,8 +499,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -549,8 +511,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -563,8 +524,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -589,8 +549,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -605,8 +564,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -620,8 +578,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -636,8 +593,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -651,8 +607,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -667,8 +622,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -682,8 +636,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -698,8 +651,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -720,8 +672,7 @@ define <8 x i32> @test_4xi32_to_8xi32_me
 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -734,8 +685,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -747,8 +697,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -761,8 +710,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -774,8 +722,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -788,8 +735,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -801,8 +747,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -815,8 +760,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -837,8 +781,7 @@ define <16 x i32> @test_4xi32_to_16xi32_
 define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -851,8 +794,7 @@ define <16 x i32> @test_masked_4xi32_to_
 define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -864,8 +806,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -878,8 +819,7 @@ define <16 x i32> @test_masked_4xi32_to_
 define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -891,8 +831,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -905,8 +844,7 @@ define <16 x i32> @test_masked_4xi32_to_
 define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -918,8 +856,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -932,8 +869,7 @@ define <16 x i32> @test_masked_4xi32_to_
 define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -954,8 +890,7 @@ define <4 x i64> @test_2xi64_to_4xi64_me
 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -968,8 +903,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -981,8 +915,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -995,8 +928,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1008,8 +940,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1022,8 +953,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1035,8 +965,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1049,8 +978,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1071,8 +999,7 @@ define <8 x i64> @test_2xi64_to_8xi64_me
 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1085,8 +1012,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1098,8 +1024,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1112,8 +1037,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1125,8 +1049,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1139,8 +1062,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1152,8 +1074,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1166,8 +1087,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1188,8 +1108,7 @@ define <16 x i32> @test_8xi32_to_16xi32_
 define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1202,8 +1121,7 @@ define <16 x i32> @test_masked_8xi32_to_
 define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1215,8 +1133,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1229,8 +1146,7 @@ define <16 x i32> @test_masked_8xi32_to_
 define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1242,8 +1158,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1256,8 +1171,7 @@ define <16 x i32> @test_masked_8xi32_to_
 define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1269,8 +1183,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1283,8 +1196,7 @@ define <16 x i32> @test_masked_8xi32_to_
 define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1305,8 +1217,7 @@ define <8 x i64> @test_4xi64_to_8xi64_me
 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1319,8 +1230,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1332,8 +1242,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1346,8 +1255,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1359,8 +1267,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1373,8 +1280,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1386,8 +1292,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1400,8 +1305,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp




More information about the llvm-commits mailing list