[llvm] r323605 - [X86] Use vptestm/vptestnm for comparisons with zero to avoid creating a zero vector.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 27 12:19:09 PST 2018
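
The old sequence materialized a zero vector with VPXOR and then compared
against it with VPCMPEQ. VPTESTNM ANDs its two source operands and sets
each mask bit when the resulting element is zero, so testing a register
against itself sets a mask bit exactly for the elements that are zero.
This folds two instructions into one and frees the register that held the
zero vector. A minimal before/after sketch of the pattern repeated
throughout this diff (register choices are illustrative):

    ; before: build a zero vector, then compare for equality with it
    vpxor     %xmm3, %xmm3, %xmm3    ; xmm3 = 0
    vpcmpeqw  %xmm3, %xmm2, %k1      ; k1[i] = (xmm2[i] == 0)

    ; after: test the mask operand against itself
    vptestnmw %xmm2, %xmm2, %k1      ; k1[i] = ((xmm2[i] & xmm2[i]) == 0)
                                     ;       = (xmm2[i] == 0)

The same rewrite applies at the other element widths (vptestnmd/vptestnmq
below), and, per the subject line, VPTESTM covers the complementary case
where the mask is compared against zero with "not equal".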


Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Sat Jan 27 12:19:09 2018
@@ -22,8 +22,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -40,8 +39,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -57,8 +55,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -75,8 +72,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -92,8 +88,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -110,8 +105,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -139,8 +133,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -157,8 +150,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -189,8 +181,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -209,8 +200,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -229,8 +219,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -249,8 +238,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -269,8 +257,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -289,8 +276,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -321,8 +307,7 @@ define <8 x i16> @test_masked_16xi16_to_
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -340,8 +325,7 @@ define <8 x i16> @test_masked_z_16xi16_t
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -369,8 +353,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
@@ -384,8 +367,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -400,8 +382,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
@@ -415,8 +396,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -431,8 +411,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
@@ -446,8 +425,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -473,8 +451,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
@@ -488,8 +465,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -516,8 +492,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -533,8 +508,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -549,8 +523,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -566,8 +539,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -582,8 +554,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -599,8 +570,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -627,8 +597,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -644,8 +613,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -673,8 +641,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -690,8 +657,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -709,8 +675,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -726,8 +691,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -745,8 +709,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -762,8 +725,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -793,8 +755,7 @@ define <16 x i16> @test_masked_32xi16_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -810,8 +771,7 @@ define <16 x i16> @test_masked_z_32xi16_
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -843,8 +803,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -862,8 +821,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -881,8 +839,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -900,8 +857,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -919,8 +875,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -938,8 +893,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -971,8 +925,7 @@ define <8 x i16> @test_masked_32xi16_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -990,8 +943,7 @@ define <8 x i16> @test_masked_z_32xi16_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
 ; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1019,8 +971,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1036,8 +987,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1053,8 +1003,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1071,8 +1020,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1086,8 +1034,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1102,8 +1049,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1128,8 +1074,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -1145,8 +1090,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1173,8 +1117,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm2
 ; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1191,8 +1134,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1209,8 +1151,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1227,8 +1168,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1245,8 +1185,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1263,8 +1202,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1297,8 +1235,7 @@ define <4 x i32> @test_masked_8xi32_to_4
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1317,8 +1254,7 @@ define <4 x i32> @test_masked_z_8xi32_to
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1346,8 +1282,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
 ; CHECK-NEXT:    vpermi2d %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
@@ -1361,8 +1296,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1377,8 +1311,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
@@ -1392,8 +1325,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1408,8 +1340,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
@@ -1423,8 +1354,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1450,8 +1380,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
@@ -1465,8 +1394,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1493,8 +1421,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1510,8 +1437,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1526,8 +1452,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
 ; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1543,8 +1468,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
 ; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1559,8 +1483,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1576,8 +1499,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1604,8 +1526,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1621,8 +1542,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1645,8 +1565,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1660,8 +1579,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1678,8 +1596,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15]
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1695,8 +1612,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1714,8 +1630,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10]
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1731,8 +1646,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1762,8 +1676,7 @@ define <8 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12]
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1779,8 +1692,7 @@ define <8 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1812,8 +1724,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1831,8 +1742,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1850,8 +1760,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6]
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1869,8 +1778,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
 ; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1888,8 +1796,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1907,8 +1814,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1949,8 +1855,7 @@ define <4 x i32> @test_masked_16xi32_to_
 ; CHECK-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm3
 ; CHECK-NEXT:    vpextrd $2, %xmm2, %eax
 ; CHECK-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1973,8 +1878,7 @@ define <4 x i32> @test_masked_z_16xi32_t
 ; CHECK-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm2
 ; CHECK-NEXT:    vpextrd $2, %xmm1, %eax
 ; CHECK-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1999,8 +1903,7 @@ define <2 x i64> @test_masked_4xi64_to_2
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -2015,8 +1918,7 @@ define <2 x i64> @test_masked_z_4xi64_to
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2030,8 +1932,7 @@ define <2 x i64> @test_masked_4xi64_to_2
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2046,8 +1947,7 @@ define <2 x i64> @test_masked_z_4xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2073,8 +1973,7 @@ define <2 x i64> @test_masked_4xi64_to_2
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm3[1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2090,8 +1989,7 @@ define <2 x i64> @test_masked_z_4xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2108,8 +2006,7 @@ define <2 x i64> @test_masked_4xi64_to_2
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2126,8 +2023,7 @@ define <2 x i64> @test_masked_z_4xi64_to
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2151,8 +2047,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2166,8 +2061,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
@@ -2181,8 +2075,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,0,2,5]
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
@@ -2196,8 +2089,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,2,5]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2212,8 +2104,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,7,2,7]
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
@@ -2227,8 +2118,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,7,2,7]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2254,8 +2144,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,4,4,3]
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
@@ -2269,8 +2158,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,4,4,3]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2285,8 +2173,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
@@ -2300,8 +2187,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2316,8 +2202,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
@@ -2331,8 +2216,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2358,8 +2242,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,2,1,7]
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
@@ -2373,8 +2256,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,7]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2389,8 +2271,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
@@ -2404,8 +2285,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2430,8 +2310,7 @@ define <2 x i64> @test_masked_8xi64_to_2
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -2447,8 +2326,7 @@ define <2 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2462,8 +2340,7 @@ define <2 x i64> @test_masked_8xi64_to_2
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2478,8 +2355,7 @@ define <2 x i64> @test_masked_z_8xi64_to
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2500,8 +2376,7 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2514,8 +2389,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2532,8 +2406,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,3,2,4]
 ; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2549,8 +2422,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2568,8 +2440,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,5,5,1]
 ; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2585,8 +2456,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2616,8 +2486,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,0,0,2]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2633,8 +2502,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2652,8 +2520,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,4,6,1]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2669,8 +2536,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2688,8 +2554,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,7,1]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2705,8 +2570,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2736,8 +2600,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,2,3,2]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2753,8 +2616,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2772,8 +2634,7 @@ define <4 x i64> @test_masked_8xi64_to_4
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,3,1,5]
 ; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm4
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -2789,8 +2650,7 @@ define <4 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2820,8 +2680,7 @@ define <2 x i64> @test_masked_8xi64_to_2
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; CHECK-NEXT:    vextracti32x4 $2, %zmm2, %xmm3
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2838,8 +2697,7 @@ define <2 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; CHECK-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2857,8 +2715,7 @@ define <2 x i64> @test_masked_8xi64_to_2
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -2876,8 +2733,7 @@ define <2 x i64> @test_masked_z_8xi64_to
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq

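(For context: every test in this file follows the same shape, a shufflevector feeding a select whose condition compares the %mask argument element-wise with zero. A minimal sketch of that IR pattern, with a hypothetical function name and the shuffle indices borrowed from one of the hunks above:

define <4 x i64> @sketch_masked_shuffle(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
  ; Permute four elements out of the 512-bit source.
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
  ; Element-wise compare of the mask against zero; this is the compare that
  ; now lowers to a single vptestnmq of the register with itself rather than
  ; the vpxor+vpcmpeqq pair on the removed lines above.
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  ; Blend the shuffle result with the passthrough under the generated k-mask.
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

The same pattern, at <2 x i64> through <32 x i16> widths and with memory operands, accounts for all of the hunks in this file and the next.)
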
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll Sat Jan 27 12:19:09 2018
@@ -14,8 +14,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -29,8 +28,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -42,8 +40,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -57,8 +54,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -70,8 +66,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -85,8 +80,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -107,8 +101,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -122,8 +115,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -145,8 +137,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -160,8 +151,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -175,8 +165,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -190,8 +179,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -205,8 +193,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -220,8 +207,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -245,8 +231,7 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -260,8 +245,7 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -284,8 +268,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -299,8 +282,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
@@ -312,8 +294,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -327,8 +308,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
@@ -340,8 +320,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -355,8 +334,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
@@ -377,8 +355,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -392,8 +369,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
@@ -415,8 +391,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -430,8 +405,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -445,8 +419,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -460,8 +433,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -475,8 +447,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -490,8 +461,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -515,8 +485,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -530,8 +499,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -554,8 +522,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -569,8 +536,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
@@ -582,8 +548,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -597,8 +562,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
@@ -610,8 +574,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -625,8 +588,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
@@ -647,8 +609,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -662,8 +623,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
@@ -685,8 +645,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -700,8 +659,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -715,8 +673,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -730,8 +687,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -745,8 +701,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -760,8 +715,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -785,8 +739,7 @@ define <8 x i32> @test_masked_8xi32_perm
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -800,8 +753,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -824,8 +776,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -839,8 +790,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
@@ -852,8 +802,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -867,8 +816,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
@@ -880,8 +828,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -895,8 +842,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
@@ -917,8 +863,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -932,8 +877,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
@@ -955,8 +899,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -970,8 +913,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -985,8 +927,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1000,8 +941,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1015,8 +955,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1030,8 +969,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1055,8 +993,7 @@ define <16 x i32> @test_masked_16xi32_pe
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1070,8 +1007,7 @@ define <16 x i32> @test_masked_z_16xi32_
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -1092,8 +1028,7 @@ define <4 x i64> @test_4xi64_perm_mask0(
 define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1106,8 +1041,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -1118,8 +1052,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1132,8 +1065,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -1144,8 +1076,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1158,8 +1089,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
@@ -1178,8 +1108,7 @@ define <4 x i64> @test_4xi64_perm_mask3(
 define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1192,8 +1121,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
@@ -1213,8 +1141,7 @@ define <4 x i64> @test_4xi64_perm_mem_ma
 define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1227,8 +1154,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1241,8 +1167,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1255,8 +1180,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1269,8 +1193,7 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1283,8 +1206,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1306,8 +1228,7 @@ define <4 x i64> @test_4xi64_perm_mem_ma
 define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1320,8 +1241,7 @@ define <4 x i64> @test_masked_4xi64_perm
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1344,8 +1264,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1359,8 +1278,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
@@ -1371,8 +1289,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1385,8 +1302,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
@@ -1398,8 +1314,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1413,8 +1328,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
@@ -1433,8 +1347,7 @@ define <8 x i64> @test_8xi64_perm_imm_ma
 define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1447,8 +1360,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
@@ -1460,8 +1372,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mask4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1475,8 +1386,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
@@ -1487,8 +1397,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1501,8 +1410,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -1523,8 +1431,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mask6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1538,8 +1445,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
@@ -1550,8 +1456,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1564,8 +1469,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
@@ -1587,8 +1491,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1602,8 +1505,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1616,8 +1518,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1630,8 +1531,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1645,8 +1545,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1660,8 +1559,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1683,8 +1581,7 @@ define <8 x i64> @test_8xi64_perm_imm_me
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1697,8 +1594,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1712,8 +1608,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1727,8 +1622,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1741,8 +1635,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1755,8 +1648,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1780,8 +1672,7 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1795,8 +1686,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1809,8 +1699,7 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
@@ -1823,8 +1712,7 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp

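For reference, each masked test in the file above follows the same IR shape: the %mask operand is compared elementwise against zero, and the resulting <N x i1> drives a select between the shuffled value and the passthrough operand. A minimal sketch of that pattern, assuming the conventions visible in the tests (the function name and shuffle indices here are illustrative only, not taken from the diff):

  define <8 x i64> @example_masked_perm(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
    ; Elementwise compare against zero; a true lane keeps the shuffled value,
    ; a false lane keeps the passthrough. This icmp-eq-with-zeroinitializer is
    ; the pattern that now selects vptestnmq %zmm2, %zmm2, %k1 in place of the
    ; old vpxor + vpcmpeqq sequence, since vptestnm of a value with itself
    ; sets a mask bit exactly when the element is zero.
    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
    %cmp = icmp eq <8 x i64> %mask, zeroinitializer
    %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
    ret <8 x i64> %res
  }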
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll Sat Jan 27 12:19:09 2018
@@ -1030,8 +1030,7 @@ define <8 x i32> @test_8xi32_shuff_mask0
 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1044,8 +1043,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1056,8 +1054,7 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1070,8 +1067,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1082,8 +1078,7 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1096,8 +1091,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1116,8 +1110,7 @@ define <8 x i32> @test_8xi32_shuff_mask3
 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1130,8 +1123,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1151,8 +1143,7 @@ define <8 x i32> @test_8xi32_shuff_mem_m
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1166,8 +1157,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1180,8 +1170,7 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1195,8 +1184,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1209,8 +1197,7 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1224,8 +1211,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1247,8 +1233,7 @@ define <8 x i32> @test_8xi32_shuff_mem_m
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1262,8 +1247,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1284,8 +1268,7 @@ define <16 x i32> @test_16xi32_shuff_mas
 define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1298,8 +1281,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -1310,8 +1292,7 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1324,8 +1305,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -1336,8 +1316,7 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1350,8 +1329,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -1370,8 +1348,7 @@ define <16 x i32> @test_16xi32_shuff_mas
 define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1384,8 +1361,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -1405,8 +1381,7 @@ define <16 x i32> @test_16xi32_shuff_mem
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1420,8 +1395,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1434,8 +1408,7 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1449,8 +1422,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1463,8 +1435,7 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1478,8 +1449,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1501,8 +1471,7 @@ define <16 x i32> @test_16xi32_shuff_mem
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1516,8 +1485,7 @@ define <16 x i32> @test_16xi32_masked_sh
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1538,8 +1506,7 @@ define <4 x i64> @test_4xi64_shuff_mask0
 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1552,8 +1519,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1564,8 +1530,7 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1578,8 +1543,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1590,8 +1554,7 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1604,8 +1567,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1624,8 +1586,7 @@ define <4 x i64> @test_4xi64_shuff_mask3
 define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1638,8 +1599,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1659,8 +1619,7 @@ define <4 x i64> @test_4xi64_shuff_mem_m
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1674,8 +1633,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1688,8 +1646,7 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1703,8 +1660,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1717,8 +1673,7 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1732,8 +1687,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1755,8 +1709,7 @@ define <4 x i64> @test_4xi64_shuff_mem_m
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1770,8 +1723,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1792,8 +1744,7 @@ define <8 x i64> @test_8xi64_shuff_mask0
 define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1806,8 +1757,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
@@ -1818,8 +1768,7 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1832,8 +1781,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
@@ -1844,8 +1792,7 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1858,8 +1805,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
@@ -1878,8 +1824,7 @@ define <8 x i64> @test_8xi64_shuff_mask3
 define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1892,8 +1837,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
@@ -1913,8 +1857,7 @@ define <8 x i64> @test_8xi64_shuff_mem_m
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1928,8 +1871,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1942,8 +1884,7 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1957,8 +1898,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1971,8 +1911,7 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1986,8 +1925,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -2009,8 +1947,7 @@ define <8 x i64> @test_8xi64_shuff_mem_m
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2024,8 +1961,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p

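The zero-masked ("_z_" / "zero_masked") variants in these files differ only in that the select's false operand is zeroinitializer, which is why they lower to a {z}-masked instruction rather than a blend into a passthrough register. A hedged sketch of that variant (names and indices again illustrative):

  define <4 x i64> @example_z_masked_shuff(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
    ; Selecting zeroinitializer on the false lanes yields zero-masking, e.g.
    ; vptestnmq %ymm2, %ymm2, %k1 followed by a {%k1} {z} shuffle, with no
    ; separate zero-vector materialization needed for the compare.
    %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
    %cmp = icmp eq <4 x i64> %mask, zeroinitializer
    %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
    ret <4 x i64> %res
  }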
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll Sat Jan 27 12:19:09 2018
@@ -12,8 +12,7 @@ define <16 x i8> @test_16xi8_perm_mask0(
 define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -26,8 +25,7 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -38,8 +36,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -52,8 +49,7 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -64,8 +60,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -78,8 +73,7 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -98,8 +92,7 @@ define <16 x i8> @test_16xi8_perm_mask3(
 define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -112,8 +105,7 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -135,8 +127,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -150,8 +141,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -165,8 +155,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -180,8 +169,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -195,8 +183,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -210,8 +197,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -235,8 +221,7 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -250,8 +235,7 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %vp
@@ -272,8 +256,7 @@ define <32 x i8> @test_32xi8_perm_mask0(
 define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -286,8 +269,7 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
@@ -298,8 +280,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -312,8 +293,7 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
@@ -324,8 +304,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -338,8 +317,7 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
@@ -358,8 +336,7 @@ define <32 x i8> @test_32xi8_perm_mask3(
 define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -372,8 +349,7 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
@@ -395,8 +371,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -410,8 +385,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -425,8 +399,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -440,8 +413,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -455,8 +427,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -470,8 +441,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -495,8 +465,7 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -510,8 +479,7 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %vp
@@ -532,8 +500,7 @@ define <64 x i8> @test_64xi8_perm_mask0(
 define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -546,8 +513,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
@@ -558,8 +524,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -572,8 +537,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
@@ -584,8 +548,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -598,8 +561,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
@@ -618,8 +580,7 @@ define <64 x i8> @test_64xi8_perm_mask3(
 define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -632,8 +593,7 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
@@ -655,8 +615,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -670,8 +629,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -685,8 +643,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -700,8 +657,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -715,8 +671,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -730,8 +685,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -755,8 +709,7 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -770,8 +723,7 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
 ; CHECK-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %vp
@@ -792,8 +744,7 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -806,8 +757,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
@@ -818,8 +768,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -832,8 +781,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -844,8 +792,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -858,8 +805,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
@@ -878,8 +824,7 @@ define <8 x i16> @test_8xi16_perm_low_ma
 define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -892,8 +837,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -904,8 +848,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -918,8 +861,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
@@ -930,8 +872,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -944,8 +885,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -964,8 +904,7 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -978,8 +917,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
@@ -990,8 +928,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1004,8 +941,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -1025,8 +961,7 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1039,8 +974,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1053,8 +987,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1067,8 +1000,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1081,8 +1013,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1095,8 +1026,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1118,8 +1048,7 @@ define <8 x i16> @test_8xi16_perm_low_me
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1132,8 +1061,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1146,8 +1074,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1160,8 +1087,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1174,8 +1100,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1188,8 +1113,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1211,8 +1135,7 @@ define <8 x i16> @test_8xi16_perm_high_m
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1225,8 +1148,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1239,8 +1161,7 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1253,8 +1174,7 @@ define <8 x i16> @test_masked_8xi16_perm
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %vp
@@ -1275,8 +1195,7 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1289,8 +1208,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
@@ -1301,8 +1219,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1315,8 +1232,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -1327,8 +1243,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1341,8 +1256,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
@@ -1361,8 +1275,7 @@ define <16 x i16> @test_16xi16_perm_low_
 define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1375,8 +1288,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -1387,8 +1299,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1401,8 +1312,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
@@ -1413,8 +1323,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1427,8 +1336,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -1447,8 +1355,7 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1461,8 +1368,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
@@ -1473,8 +1379,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1487,8 +1392,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -1508,8 +1412,7 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1522,8 +1425,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1536,8 +1438,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1550,8 +1451,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1564,8 +1464,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1578,8 +1477,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1601,8 +1499,7 @@ define <16 x i16> @test_16xi16_perm_low_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1615,8 +1512,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1629,8 +1525,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1643,8 +1538,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1657,8 +1551,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1671,8 +1564,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1694,8 +1586,7 @@ define <16 x i16> @test_16xi16_perm_high
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1708,8 +1599,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1722,8 +1612,7 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1736,8 +1625,7 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
@@ -1758,8 +1646,7 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1772,8 +1659,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
@@ -1784,8 +1670,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1798,8 +1683,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -1810,8 +1694,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1824,8 +1707,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
@@ -1844,8 +1726,7 @@ define <32 x i16> @test_32xi16_perm_low_
 define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1858,8 +1739,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1870,8 +1750,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1884,8 +1763,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
@@ -1896,8 +1774,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1910,8 +1787,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -1930,8 +1806,7 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1944,8 +1819,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
@@ -1956,8 +1830,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1970,8 +1843,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -1991,8 +1863,7 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2005,8 +1876,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2019,8 +1889,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2033,8 +1902,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2047,8 +1915,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2061,8 +1928,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2084,8 +1950,7 @@ define <32 x i16> @test_32xi16_perm_low_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2098,8 +1963,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2112,8 +1976,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2126,8 +1989,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2141,8 +2003,7 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2156,8 +2017,7 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2179,8 +2039,7 @@ define <32 x i16> @test_32xi16_perm_high
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2193,8 +2052,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2207,8 +2065,7 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2221,8 +2078,7 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
@@ -2243,8 +2099,7 @@ define <4 x i32> @test_4xi32_perm_mask0(
 define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -2257,8 +2112,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
@@ -2269,8 +2123,7 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -2283,8 +2136,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
@@ -2295,8 +2147,7 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -2309,8 +2160,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
@@ -2329,8 +2179,7 @@ define <4 x i32> @test_4xi32_perm_mask3(
 define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -2343,8 +2192,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
@@ -2364,8 +2212,7 @@ define <4 x i32> @test_4xi32_perm_mem_ma
 define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2378,8 +2225,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2392,8 +2238,7 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2406,8 +2251,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2420,8 +2264,7 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2434,8 +2277,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2457,8 +2299,7 @@ define <4 x i32> @test_4xi32_perm_mem_ma
 define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2471,8 +2312,7 @@ define <4 x i32> @test_masked_4xi32_perm
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -2493,8 +2333,7 @@ define <8 x i32> @test_8xi32_perm_mask0(
 define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2507,8 +2346,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
@@ -2519,8 +2357,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2533,8 +2370,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
@@ -2545,8 +2381,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2559,8 +2394,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
@@ -2579,8 +2413,7 @@ define <8 x i32> @test_8xi32_perm_mask3(
 define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2593,8 +2426,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
@@ -2614,8 +2446,7 @@ define <8 x i32> @test_8xi32_perm_mem_ma
 define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2628,8 +2459,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2642,8 +2472,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2656,8 +2485,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2670,8 +2498,7 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2684,8 +2511,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2707,8 +2533,7 @@ define <8 x i32> @test_8xi32_perm_mem_ma
 define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2721,8 +2546,7 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -2743,8 +2567,7 @@ define <16 x i32> @test_16xi32_perm_mask
 define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2757,8 +2580,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
@@ -2769,8 +2591,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2783,8 +2604,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
@@ -2795,8 +2615,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2809,8 +2628,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
@@ -2829,8 +2647,7 @@ define <16 x i32> @test_16xi32_perm_mask
 define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2843,8 +2660,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
@@ -2864,8 +2680,7 @@ define <16 x i32> @test_16xi32_perm_mem_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2878,8 +2693,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2892,8 +2706,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2906,8 +2719,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2920,8 +2732,7 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2934,8 +2745,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2957,8 +2767,7 @@ define <16 x i32> @test_16xi32_perm_mem_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
@@ -2971,8 +2780,7 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
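
These shuffle tests all share one masking idiom: the incoming mask vector is compared against zero and the comparison drives a masked or zeroing operation. A minimal sketch of that pattern, not taken from the patch (function name is illustrative; any AVX-512VL-capable target should reproduce it), is:

    define <4 x i32> @zero_mask_blend(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
      ; After this change the icmp-with-zero mask lowers to
      ;   vptestnmd %xmm0, %xmm0, %k1
      ; instead of materializing a zero vector with vpxor and
      ; comparing against it with vpcmpeqd.
      %m = icmp eq <4 x i32> %mask, zeroinitializer
      %r = select <4 x i1> %m, <4 x i32> %a, <4 x i32> %b
      ret <4 x i32> %r
    }

VPTESTNM sets a mask bit where the AND of an element with itself is zero, and x AND x is zero exactly when x is zero, so the eq-with-zero predicate needs no zero-vector operand at all.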

Modified: llvm/trunk/test/CodeGen/X86/avx512-vbroadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vbroadcast.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vbroadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vbroadcast.ll Sat Jan 27 12:19:09 2018
@@ -44,8 +44,7 @@ define   <16 x float> @_inreg16xfloat(fl
 define   <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
 ; ALL-LABEL: _ss16xfloat_mask:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; ALL-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; ALL-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
 ; ALL-NEXT:    vmovaps %zmm1, %zmm0
 ; ALL-NEXT:    retq
@@ -59,8 +58,7 @@ define   <16 x float> @_ss16xfloat_mask(
 define   <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
 ; ALL-LABEL: _ss16xfloat_maskz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; ALL-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -84,8 +82,7 @@ define   <16 x float> @_ss16xfloat_load(
 define   <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
 ; ALL-LABEL: _ss16xfloat_mask_load:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; ALL-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
 ; ALL-NEXT:    retq
   %a = load float, float* %a.ptr
@@ -99,8 +96,7 @@ define   <16 x float> @_ss16xfloat_mask_
 define   <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
 ; ALL-LABEL: _ss16xfloat_maskz_load:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; ALL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; ALL-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
   %a = load float, float* %a.ptr
@@ -125,8 +121,7 @@ define   <8 x double> @_sd8xdouble_mask(
 ; ALL-LABEL: _sd8xdouble_mask:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def %ymm2 killed %ymm2 def %zmm2
-; ALL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; ALL-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; ALL-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; ALL-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
 ; ALL-NEXT:    vmovapd %zmm1, %zmm0
 ; ALL-NEXT:    retq
@@ -141,8 +136,7 @@ define   <8 x double> @_sd8xdouble_maskz
 ; ALL-LABEL: _sd8xdouble_maskz:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
-; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; ALL-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -167,8 +161,7 @@ define   <8 x double> @_sd8xdouble_mask_
 ; ALL-LABEL: _sd8xdouble_mask_load:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
-; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; ALL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; ALL-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
 ; ALL-NEXT:    retq
   %a = load double, double* %a.ptr
@@ -183,8 +176,7 @@ define   <8 x double> @_sd8xdouble_maskz
 ; ALL-LABEL: _sd8xdouble_maskz_load:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; ALL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; ALL-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
   %a = load double, double* %a.ptr
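
The broadcast tests above cover the complementary predicate: an icmp ne against zeroinitializer, which now selects VPTESTM (mask bit set where the element ANDed with itself is nonzero) in place of vpxor plus vpcmpneqd. A minimal sketch of that shape, again not taken from the patch and with an illustrative function name:

    define <8 x i32> @nonzero_mask_blend(<8 x i32> %mask1, <8 x i32> %a, <8 x i32> %b) {
      ; With this change the nonzero test lowers to
      ;   vptestmd %ymm0, %ymm0, %k1
      ; rather than vpxor + vpcmpneqd.
      %m = icmp ne <8 x i32> %mask1, zeroinitializer
      %r = select <8 x i1> %m, <8 x i32> %a, <8 x i32> %b
      ret <8 x i32> %r
    }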

Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Sat Jan 27 12:19:09 2018
@@ -926,8 +926,7 @@ define <2 x i64> @test46(<2 x float> %x,
 define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
 ; KNL-LABEL: test47:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
@@ -938,8 +937,7 @@ define <16 x i8> @test47(<16 x i32> %a,
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    ## kill: def %xmm2 killed %xmm2 def %zmm2
 ; AVX512BW-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
-; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512BW-NEXT:    vpblendmb %zmm1, %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -947,8 +945,7 @@ define <16 x i8> @test47(<16 x i32> %a,
 ;
 ; SKX-LABEL: test47:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; SKX-NEXT:    vpblendmb %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -960,8 +957,7 @@ define <16 x i8> @test47(<16 x i32> %a,
 define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) {
 ; KNL-LABEL: test48:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
@@ -971,16 +967,14 @@ define <16 x i16> @test48(<16 x i32> %a,
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    ## kill: def %ymm2 killed %ymm2 def %zmm2
 ; AVX512BW-NEXT:    ## kill: def %ymm1 killed %ymm1 def %zmm1
-; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512BW-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    ## kill: def %ymm0 killed %ymm0 killed %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; SKX-LABEL: test48:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT:    vpcmpeqd %zmm3, %zmm0, %k1
+; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; SKX-NEXT:    vpblendmw %ymm1, %ymm2, %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %cmp = icmp eq <16 x i32> %a, zeroinitializer
@@ -991,8 +985,7 @@ define <16 x i16> @test48(<16 x i32> %a,
 define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) {
 ; KNL-LABEL: test49:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpeqq %zmm3, %zmm0, %k1
+; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
@@ -1003,8 +996,7 @@ define <8 x i16> @test49(<8 x i64> %a, <
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    ## kill: def %xmm2 killed %xmm2 def %zmm2
 ; AVX512BW-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
-; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpcmpeqq %zmm3, %zmm0, %k1
+; AVX512BW-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; AVX512BW-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1012,8 +1004,7 @@ define <8 x i16> @test49(<8 x i64> %a, <
 ;
 ; SKX-LABEL: test49:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT:    vpcmpeqq %zmm3, %zmm0, %k1
+; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1
 ; SKX-NEXT:    vpblendmw %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
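
As an illustration of the first pattern (a reduced sketch, not part of this patch; the function name is made up): an icmp-eq-zero mask feeding a select, like test47 above, should now produce the single vptestnmd shown in the SKX checks when compiled with llc for an AVX512BW+VL target.

; Reduced sketch; name and layout are hypothetical.
define <16 x i8> @select_on_zero_cmp(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
  ; Comparing against zeroinitializer used to emit vpxor + vpcmpeqd;
  ; it now folds to vptestnmd %zmm0, %zmm0, %k1.
  %cmp = icmp eq <16 x i32> %a, zeroinitializer
  %r = select <16 x i1> %cmp, <16 x i8> %b, <16 x i8> %c
  ret <16 x i8> %r
}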

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-mov.ll Sat Jan 27 12:19:09 2018
@@ -24,8 +24,7 @@ define void @test2(i8 * %addr, <64 x i8>
 define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
 ; CHECK-LABEL: test3:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqb %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmb %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <64 x i8> %mask1, zeroinitializer
@@ -38,8 +37,7 @@ define <64 x i8> @test3(i8 * %addr, <64
 define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
 ; CHECK-LABEL: test4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpneqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestmb %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <64 x i8> %mask1, zeroinitializer
@@ -72,8 +70,7 @@ define void @test6(i8 * %addr, <32 x i16
 define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
 ; CHECK-LABEL: test7:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqw %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vptestmw %zmm1, %zmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <32 x i16> %mask1, zeroinitializer
@@ -86,8 +83,7 @@ define <32 x i16> @test7(i8 * %addr, <32
 define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
 ; CHECK-LABEL: test8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpneqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vptestmw %zmm0, %zmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <32 x i16> %mask1, zeroinitializer
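
The ne-zero variants in these mov tests follow the second pattern: the mask for a masked load is an icmp ne against zeroinitializer, which now lowers to vptestm instead of vpxor + vpcmpneq. A reduced sketch (hypothetical name; the intrinsic is the one these tests exercise, in the typed-pointer form of this era):

define <64 x i8> @masked_load_on_nonzero(i8* %addr, <64 x i8> %old, <64 x i8> %mask1) {
  ; icmp ne x, 0 becomes vptestmb %zmm1, %zmm1, %k1 on an AVX512BW target.
  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
  %vaddr = bitcast i8* %addr to <64 x i8>*
  %r = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %vaddr, i32 4, <64 x i1> %mask, <64 x i8> %old)
  ret <64 x i8> %r
}
declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)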

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-mov.ll Sat Jan 27 12:19:09 2018
@@ -24,8 +24,7 @@ define void @test_256_2(i8 * %addr, <32
 define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
 ; CHECK-LABEL: test_256_3:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT:    vptestmb %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x26,0xc9]
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <32 x i8> %mask1, zeroinitializer
@@ -38,8 +37,7 @@ define <32 x i8> @test_256_3(i8 * %addr,
 define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
 ; CHECK-LABEL: test_256_4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT:    vptestmb %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc8]
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <32 x i8> %mask1, zeroinitializer
@@ -72,8 +70,7 @@ define void @test_256_6(i8 * %addr, <16
 define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
 ; CHECK-LABEL: test_256_7:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT:    vptestmw %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x26,0xc9]
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i16> %mask1, zeroinitializer
@@ -86,8 +83,7 @@ define <16 x i16> @test_256_7(i8 * %addr
 define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
 ; CHECK-LABEL: test_256_8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT:    vptestmw %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc8]
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i16> %mask1, zeroinitializer
@@ -120,8 +116,7 @@ define void @test_128_2(i8 * %addr, <16
 define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
 ; CHECK-LABEL: test_128_3:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT:    vptestmb %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x26,0xc9]
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i8> %mask1, zeroinitializer
@@ -134,8 +129,7 @@ define <16 x i8> @test_128_3(i8 * %addr,
 define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
 ; CHECK-LABEL: test_128_4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT:    vptestmb %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc8]
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <16 x i8> %mask1, zeroinitializer
@@ -168,8 +162,7 @@ define void @test_128_6(i8 * %addr, <8 x
 define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
 ; CHECK-LABEL: test_128_7:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT:    vptestmw %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x26,0xc9]
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i16> %mask1, zeroinitializer
@@ -182,8 +175,7 @@ define <8 x i16> @test_128_7(i8 * %addr,
 define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
 ; CHECK-LABEL: test_128_8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT:    vptestmw %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc8]
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i16> %mask1, zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-arith.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-arith.ll Sat Jan 27 12:19:09 2018
@@ -76,8 +76,7 @@ define <8 x i32> @vpaddd256_broadcast_te
 define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd256_mask_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmd %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x28,0x27,0xca]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -89,8 +88,7 @@ define <8 x i32> @vpaddd256_mask_test(<8
 define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd256_maskz_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmd %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x28,0x27,0xca]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -102,8 +100,7 @@ define <8 x i32> @vpaddd256_maskz_test(<
 define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd256_mask_fold_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -116,8 +113,7 @@ define <8 x i32> @vpaddd256_mask_fold_te
 define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd256_mask_broadcast_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x05,A,A,A,A]
 ; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI10_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -130,8 +126,7 @@ define <8 x i32> @vpaddd256_mask_broadca
 define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd256_maskz_fold_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -144,8 +139,7 @@ define <8 x i32> @vpaddd256_maskz_fold_t
 define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd256_maskz_broadcast_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x05,A,A,A,A]
 ; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI12_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -216,8 +210,7 @@ define <8 x float> @test_broadcast_vaddp
 define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vaddps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x58,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -229,8 +222,7 @@ define <8 x float> @test_mask_vaddps_256
 define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmulps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vmulps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x59,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -242,8 +234,7 @@ define <8 x float> @test_mask_vmulps_256
 define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1)nounwind readnone {
 ; CHECK-LABEL: test_mask_vminps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vminps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5d,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -256,8 +247,7 @@ define <8 x float> @test_mask_vminps_256
 define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmaxps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vmaxps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -270,8 +260,7 @@ define <8 x float> @test_mask_vmaxps_256
 define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vsubps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vsubps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5c,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -283,8 +272,7 @@ define <8 x float> @test_mask_vsubps_256
 define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vdivps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0x65,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vdivps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5e,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -296,8 +284,7 @@ define <8 x float> @test_mask_vdivps_256
 define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmulpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vmulpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x59,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -309,8 +296,7 @@ define <4 x double> @test_mask_vmulpd_25
 define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vminpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vminpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5d,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -323,8 +309,7 @@ define <4 x double> @test_mask_vminpd_25
 define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmaxpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vmaxpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -337,8 +322,7 @@ define <4 x double> @test_mask_vmaxpd_25
 define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vsubpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vsubpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5c,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -350,8 +334,7 @@ define <4 x double> @test_mask_vsubpd_25
 define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vdivpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vdivpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5e,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -363,8 +346,7 @@ define <4 x double> @test_mask_vdivpd_25
 define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %ymm3, %ymm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x28,0x27,0xcb]
 ; CHECK-NEXT:    vaddpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -376,8 +358,7 @@ define <4 x double> @test_mask_vaddpd_25
 define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_maskz_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
 ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -389,8 +370,7 @@ define <4 x double> @test_maskz_vaddpd_2
 define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double>* %j,  <4 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_mask_fold_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
 ; CHECK-NEXT:    vaddpd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -403,8 +383,7 @@ define <4 x double> @test_mask_fold_vadd
 define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_maskz_fold_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -429,8 +408,7 @@ define <4 x double> @test_broadcast2_vad
 define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_mask_broadcast_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT:    vpcmpneqq %ymm0, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xc8,0x04]
+; CHECK-NEXT:    vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
 ; CHECK-NEXT:    vaddpd (%rdi){1to4}, %ymm1, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x39,0x58,0x0f]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -446,8 +424,7 @@ define <4 x double> @test_mask_broadcast
 define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_maskz_broadcast_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vaddpd (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -524,8 +501,7 @@ define <4 x i32> @vpaddd128_broadcast_te
 define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd128_mask_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmd %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x08,0x27,0xca]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -537,8 +513,7 @@ define <4 x i32> @vpaddd128_mask_test(<4
 define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd128_maskz_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmd %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0x6d,0x08,0x27,0xca]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -550,8 +525,7 @@ define <4 x i32> @vpaddd128_maskz_test(<
 define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd128_mask_fold_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -564,8 +538,7 @@ define <4 x i32> @vpaddd128_mask_fold_te
 define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd128_mask_broadcast_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x05,A,A,A,A]
 ; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI46_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -578,8 +551,7 @@ define <4 x i32> @vpaddd128_mask_broadca
 define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd128_maskz_fold_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -592,8 +564,7 @@ define <4 x i32> @vpaddd128_maskz_fold_t
 define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: vpaddd128_maskz_broadcast_test:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x05,A,A,A,A]
 ; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI48_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -664,8 +635,7 @@ define <4 x float> @test_broadcast_vaddp
 define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vaddps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vaddps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x58,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -677,8 +647,7 @@ define <4 x float> @test_mask_vaddps_128
 define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmulps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vmulps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x59,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -690,8 +659,7 @@ define <4 x float> @test_mask_vmulps_128
 define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vminps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vminps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5d,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -704,8 +672,7 @@ define <4 x float> @test_mask_vminps_128
 define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmaxps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vmaxps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -718,8 +685,7 @@ define <4 x float> @test_mask_vmaxps_128
 define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vsubps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vsubps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5c,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -732,8 +698,7 @@ define <4 x float> @test_mask_vsubps_128
 define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vdivps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmd %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0x65,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vdivps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5e,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -745,8 +710,7 @@ define <4 x float> @test_mask_vdivps_128
 define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmulpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vmulpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x59,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -758,8 +722,7 @@ define <2 x double> @test_mask_vmulpd_12
 define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vminpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vminpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5d,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -772,8 +735,7 @@ define <2 x double> @test_mask_vminpd_12
 define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vmaxpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vmaxpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -786,8 +748,7 @@ define <2 x double> @test_mask_vmaxpd_12
 define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vsubpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vsubpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5c,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -799,8 +760,7 @@ define <2 x double> @test_mask_vsubpd_12
 define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vdivpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vdivpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5e,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -812,8 +772,7 @@ define <2 x double> @test_mask_vdivpd_12
 define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
 ; CHECK-LABEL: test_mask_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; CHECK-NEXT:    vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
+; CHECK-NEXT:    vptestmq %xmm3, %xmm3, %k1 ## encoding: [0x62,0xf2,0xe5,0x08,0x27,0xcb]
 ; CHECK-NEXT:    vaddpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -825,8 +784,7 @@ define <2 x double> @test_mask_vaddpd_12
 define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j,
 ; CHECK-LABEL: test_maskz_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
                                           <2 x i64> %mask1) nounwind readnone {
@@ -839,8 +797,7 @@ define <2 x double> @test_maskz_vaddpd_1
 define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double>* %j,  <2 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_mask_fold_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
-; CHECK-NEXT:    vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04]
+; CHECK-NEXT:    vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
 ; CHECK-NEXT:    vaddpd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -853,8 +810,7 @@ define <2 x double> @test_mask_fold_vadd
 define <2 x double> @test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_maskz_fold_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -879,8 +835,7 @@ define <2 x double> @test_broadcast2_vad
 define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_mask_broadcast_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT:    vpcmpneqq %xmm0, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xc8,0x04]
+; CHECK-NEXT:    vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
 ; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm1, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x19,0x58,0x0f]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -896,8 +851,7 @@ define <2 x double> @test_mask_broadcast
 define <2 x double> @test_maskz_broadcast_vaddpd_128(<2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_maskz_broadcast_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
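
The arith tests above exercise the third pattern: the ne-zero mask drives a masked binary op through a select. A reduced sketch (hypothetical name) matching the vpaddd256_mask_test shape:

define <8 x i32> @masked_add_on_nonzero(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) {
  ; The mask compare becomes vptestmd %ymm2, %ymm2, %k1 on an AVX512VL target,
  ; and the select folds into vpaddd %ymm1, %ymm0, %ymm0 {%k1}.
  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
  %x = add <8 x i32> %i, %j
  %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i
  ret <8 x i32> %r
}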

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-mov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-mov.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-mov.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-mov.ll Sat Jan 27 12:19:09 2018
@@ -164,8 +164,7 @@ define <8 x float> @test_256_16(i8 * %ad
 define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
 ; CHECK-LABEL: test_256_17:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -178,8 +177,7 @@ define <8 x i32> @test_256_17(i8 * %addr
 define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
 ; CHECK-LABEL: test_256_18:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0x75,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -192,8 +190,7 @@ define <8 x i32> @test_256_18(i8 * %addr
 define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
 ; CHECK-LABEL: test_256_19:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -206,8 +203,7 @@ define <8 x i32> @test_256_19(i8 * %addr
 define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
 ; CHECK-LABEL: test_256_20:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -220,8 +216,7 @@ define <8 x i32> @test_256_20(i8 * %addr
 define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_21:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -234,8 +229,7 @@ define <4 x i64> @test_256_21(i8 * %addr
 define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_22:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -248,8 +242,7 @@ define <4 x i64> @test_256_22(i8 * %addr
 define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_23:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -262,8 +255,7 @@ define <4 x i64> @test_256_23(i8 * %addr
 define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_24:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -332,8 +324,7 @@ define <8 x float> @test_256_28(i8 * %ad
 define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_29:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -346,8 +337,7 @@ define <4 x double> @test_256_29(i8 * %a
 define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_30:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %ymm1, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x27,0xc9]
 ; CHECK-NEXT:    vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -360,8 +350,7 @@ define <4 x double> @test_256_30(i8 * %a
 define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_31:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
 ; CHECK-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -374,8 +363,7 @@ define <4 x double> @test_256_31(i8 * %a
 define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
 ; CHECK-LABEL: test_256_32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %ymm0, %ymm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc8]
 ; CHECK-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
@@ -548,8 +536,7 @@ define <4 x float> @test_128_16(i8 * %ad
 define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_17:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -562,8 +549,7 @@ define <4 x i32> @test_128_17(i8 * %addr
 define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_18:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -576,8 +562,7 @@ define <4 x i32> @test_128_18(i8 * %addr
 define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_19:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -590,8 +575,7 @@ define <4 x i32> @test_128_19(i8 * %addr
 define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_20:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -604,8 +588,7 @@ define <4 x i32> @test_128_20(i8 * %addr
 define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_21:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -618,8 +601,7 @@ define <2 x i64> @test_128_21(i8 * %addr
 define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_22:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -632,8 +614,7 @@ define <2 x i64> @test_128_22(i8 * %addr
 define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_23:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -646,8 +627,7 @@ define <2 x i64> @test_128_23(i8 * %addr
 define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_24:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovdqu64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -660,8 +640,7 @@ define <2 x i64> @test_128_24(i8 * %addr
 define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_25:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -674,8 +653,7 @@ define <4 x float> @test_128_25(i8 * %ad
 define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_26:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -688,8 +666,7 @@ define <4 x float> @test_128_26(i8 * %ad
 define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_27:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -702,8 +679,7 @@ define <4 x float> @test_128_27(i8 * %ad
 define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
 ; CHECK-LABEL: test_128_28:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -716,8 +692,7 @@ define <4 x float> @test_128_28(i8 * %ad
 define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_29:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -730,8 +705,7 @@ define <2 x double> @test_128_29(i8 * %a
 define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_30:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; CHECK-NEXT:    vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x27,0xc9]
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -744,8 +718,7 @@ define <2 x double> @test_128_30(i8 * %a
 define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_31:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
@@ -758,8 +731,7 @@ define <2 x double> @test_128_31(i8 * %a
 define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
 ; CHECK-LABEL: test_128_32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll Sat Jan 27 12:19:09 2018
@@ -70,8 +70,7 @@ define   <8 x float> @_inreg8xfloat(floa
 define   <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1) {
 ; CHECK-LABEL: _ss8xfloat_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vptestmd %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -84,8 +83,7 @@ define   <8 x float> @_ss8xfloat_mask(<8
 define   <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) {
 ; CHECK-LABEL: _ss8xfloat_maskz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vptestmd %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -108,8 +106,7 @@ define   <4 x float> @_inreg4xfloat(floa
 define   <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1) {
 ; CHECK-LABEL: _ss4xfloat_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -122,8 +119,7 @@ define   <4 x float> @_ss4xfloat_mask(<4
 define   <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) {
 ; CHECK-LABEL: _ss4xfloat_maskz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -146,8 +142,7 @@ define   <4 x double> @_inreg4xdouble(do
 define   <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %mask1) {
 ; CHECK-LABEL: _ss4xdouble_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT:    vptestmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -160,8 +155,7 @@ define   <4 x double> @_ss4xdouble_mask(
 define   <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
 ; CHECK-LABEL: _ss4xdouble_maskz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <4 x i32> %mask1, zeroinitializer
@@ -185,8 +179,7 @@ define <2 x double> @test_v2f64_broadcas
 define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
 ; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/compress_expand.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/compress_expand.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/compress_expand.ll (original)
+++ llvm/trunk/test/CodeGen/X86/compress_expand.ll Sat Jan 27 12:19:09 2018
@@ -256,7 +256,7 @@ define <2 x float> @test13(float* %base,
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; SKX-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
@@ -265,7 +265,7 @@ define <2 x float> @test13(float* %base,
 ; KNL-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; KNL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; KNL-NEXT:    vpcmpeqq %zmm2, %zmm1, %k0
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
@@ -281,7 +281,7 @@ define void @test14(float* %base, <2 x f
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
+; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; SKX-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
 ; SKX-NEXT:    retq
 ;
@@ -290,7 +290,7 @@ define void @test14(float* %base, <2 x f
 ; KNL-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; KNL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; KNL-NEXT:    vpcmpeqq %zmm2, %zmm1, %k0
+; KNL-NEXT:    vptestnmq %zmm1, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
@@ -303,9 +303,8 @@ define void @test14(float* %base, <2 x f
 define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigger) {
 ; ALL-LABEL: test15:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; ALL-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
-; ALL-NEXT:    vpcmpeqd %zmm4, %zmm2, %k2
+; ALL-NEXT:    vptestnmd %zmm3, %zmm3, %k1
+; ALL-NEXT:    vptestnmd %zmm2, %zmm2, %k2
 ; ALL-NEXT:    kmovw %k2, %eax
 ; ALL-NEXT:    popcntl %eax, %eax
 ; ALL-NEXT:    vexpandps (%rdi,%rax,4), %zmm1 {%k1}
@@ -320,9 +319,8 @@ define <16 x double> @test16(double* %ba
 ; SKX-LABEL: test16:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
-; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k2
+; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1
+; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k2
 ; SKX-NEXT:    kmovb %k2, %eax
 ; SKX-NEXT:    popcntl %eax, %eax
 ; SKX-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
@@ -331,10 +329,9 @@ define <16 x double> @test16(double* %ba
 ;
 ; KNL-LABEL: test16:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; KNL-NEXT:    vpcmpeqd %zmm3, %zmm4, %k1
-; KNL-NEXT:    vpcmpeqd %zmm3, %zmm2, %k2
+; KNL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; KNL-NEXT:    vptestnmd %zmm3, %zmm3, %k1
+; KNL-NEXT:    vptestnmd %zmm2, %zmm2, %k2
 ; KNL-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
 ; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    movzbl %al, %eax
@@ -349,9 +346,8 @@ define <16 x double> @test16(double* %ba
 define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
 ; SKX-LABEL: test17:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
-; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k2
+; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1
+; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k2
 ; SKX-NEXT:    kmovw %k2, %eax
 ; SKX-NEXT:    popcntl %eax, %eax
 ; SKX-NEXT:    vcompressps %zmm1, (%rdi,%rax,4) {%k1}
@@ -361,9 +357,8 @@ define void @test17(float* %base, <32 x
 ;
 ; KNL-LABEL: test17:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
-; KNL-NEXT:    vpcmpeqd %zmm4, %zmm2, %k2
+; KNL-NEXT:    vptestnmd %zmm3, %zmm3, %k1
+; KNL-NEXT:    vptestnmd %zmm2, %zmm2, %k2
 ; KNL-NEXT:    kmovw %k2, %eax
 ; KNL-NEXT:    popcntl %eax, %eax
 ; KNL-NEXT:    vcompressps %zmm1, (%rdi,%rax,4) {%k1}

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Sat Jan 27 12:19:09 2018
@@ -2769,10 +2769,9 @@ define <16 x double> @test_gather_setcc_
 ; KNL_64-LABEL: test_gather_setcc_split:
 ; KNL_64:       # %bb.0:
 ; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; KNL_64-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; KNL_64-NEXT:    vpcmpeqd %zmm5, %zmm6, %k1
-; KNL_64-NEXT:    vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; KNL_64-NEXT:    vptestnmd %zmm5, %zmm5, %k1
+; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
 ; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
 ; KNL_64-NEXT:    vmovapd %zmm2, %zmm0
@@ -2791,10 +2790,9 @@ define <16 x double> @test_gather_setcc_
 ; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
 ; KNL_32-NEXT:    movl 8(%ebp), %eax
 ; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; KNL_32-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; KNL_32-NEXT:    vpcmpeqd %zmm5, %zmm6, %k1
-; KNL_32-NEXT:    vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; KNL_32-NEXT:    vptestnmd %zmm5, %zmm5, %k1
+; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
 ; KNL_32-NEXT:    vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
 ; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
@@ -2807,9 +2805,8 @@ define <16 x double> @test_gather_setcc_
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
 ; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
-; SKX-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; SKX-NEXT:    vpcmpeqd %ymm6, %ymm5, %k1
-; SKX-NEXT:    vpcmpeqd %ymm6, %ymm1, %k2
+; SKX-NEXT:    vptestnmd %ymm5, %ymm5, %k1
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; SKX-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
 ; SKX-NEXT:    vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
 ; SKX-NEXT:    vmovapd %zmm2, %zmm0
@@ -2829,9 +2826,8 @@ define <16 x double> @test_gather_setcc_
 ; SKX_32-NEXT:    movl 8(%ebp), %eax
 ; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
 ; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
-; SKX_32-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; SKX_32-NEXT:    vpcmpeqd %ymm6, %ymm5, %k1
-; SKX_32-NEXT:    vpcmpeqd %ymm6, %ymm1, %k2
+; SKX_32-NEXT:    vptestnmd %ymm5, %ymm5, %k1
+; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; SKX_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
 ; SKX_32-NEXT:    vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
 ; SKX_32-NEXT:    vmovapd %zmm2, %zmm0
@@ -2851,10 +2847,9 @@ define void @test_scatter_setcc_split(do
 ; KNL_64-LABEL: test_scatter_setcc_split:
 ; KNL_64:       # %bb.0:
 ; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; KNL_64-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; KNL_64-NEXT:    vpcmpeqd %zmm5, %zmm1, %k1
+; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_64-NEXT:    vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; KNL_64-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2}
 ; KNL_64-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
 ; KNL_64-NEXT:    vzeroupper
@@ -2872,10 +2867,9 @@ define void @test_scatter_setcc_split(do
 ; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
 ; KNL_32-NEXT:    movl 8(%ebp), %eax
 ; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; KNL_32-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; KNL_32-NEXT:    vpcmpeqd %zmm5, %zmm1, %k1
+; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL_32-NEXT:    vpcmpeqd %zmm5, %zmm1, %k2
+; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; KNL_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2}
 ; KNL_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
 ; KNL_32-NEXT:    movl %ebp, %esp
@@ -2886,10 +2880,9 @@ define void @test_scatter_setcc_split(do
 ; SKX-LABEL: test_scatter_setcc_split:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; SKX-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; SKX-NEXT:    vpcmpeqd %ymm5, %ymm1, %k1
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; SKX-NEXT:    vpcmpeqd %ymm5, %ymm1, %k2
+; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; SKX-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2}
 ; SKX-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
 ; SKX-NEXT:    vzeroupper
@@ -2907,10 +2900,9 @@ define void @test_scatter_setcc_split(do
 ; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
 ; SKX_32-NEXT:    movl 8(%ebp), %eax
 ; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
-; SKX_32-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; SKX_32-NEXT:    vpcmpeqd %ymm5, %ymm1, %k1
+; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; SKX_32-NEXT:    vpcmpeqd %ymm5, %ymm1, %k2
+; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; SKX_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2}
 ; SKX_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
 ; SKX_32-NEXT:    movl %ebp, %esp

Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Sat Jan 27 12:19:09 2018
@@ -101,8 +101,7 @@ define <2 x double> @test6(<2 x i64> %tr
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
 ; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -112,8 +111,7 @@ define <2 x double> @test6(<2 x i64> %tr
 ;
 ; SKX-LABEL: test6:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
@@ -134,8 +132,7 @@ define <4 x float> @test7(<4 x i32> %tri
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
@@ -145,8 +142,7 @@ define <4 x float> @test7(<4 x i32> %tri
 ;
 ; SKX-LABEL: test7:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -175,8 +171,7 @@ define <4 x i32> @test8(<4 x i32> %trigg
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
@@ -186,8 +181,7 @@ define <4 x i32> @test8(<4 x i32> %trigg
 ;
 ; SKX-LABEL: test8:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -214,8 +208,7 @@ define void @test9(<4 x i32> %trigger, <
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
@@ -224,8 +217,7 @@ define void @test9(<4 x i32> %trigger, <
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -259,8 +251,7 @@ define <4 x double> @test10(<4 x i32> %t
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %ymm1 killed %ymm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -269,8 +260,7 @@ define <4 x double> @test10(<4 x i32> %t
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -301,8 +291,7 @@ define <4 x double> @test10b(<4 x i32> %
 ; AVX512F-LABEL: test10b:
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %xmm0 killed %xmm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z}
@@ -311,8 +300,7 @@ define <4 x double> @test10b(<4 x i32> %
 ;
 ; SKX-LABEL: test10b:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -344,8 +332,7 @@ define <8 x float> @test11a(<8 x i32> %t
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %ymm1 killed %ymm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %ymm0 killed %ymm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
 ; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
@@ -354,8 +341,7 @@ define <8 x float> @test11a(<8 x i32> %t
 ;
 ; SKX-LABEL: test11a:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; SKX-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -510,8 +496,7 @@ define void @test12(<8 x i32> %trigger,
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    ## kill: def %ymm1 killed %ymm1 def %zmm1
 ; AVX512F-NEXT:    ## kill: def %ymm0 killed %ymm0 def %zmm0
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
 ; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
@@ -520,8 +505,7 @@ define void @test12(<8 x i32> %trigger,
 ;
 ; SKX-LABEL: test12:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; SKX-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -554,7 +538,7 @@ define void @test14(<2 x i32> %trigger,
 ; AVX512F-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
 ; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
@@ -565,7 +549,7 @@ define void @test14(<2 x i32> %trigger,
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vmovups %xmm1, (%rdi) {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -598,10 +582,10 @@ define void @test15(<2 x i32> %trigger,
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -610,7 +594,7 @@ define void @test15(<2 x i32> %trigger,
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -644,7 +628,7 @@ define <2 x float> @test16(<2 x i32> %tr
 ; AVX512F-NEXT:    ## kill: def %xmm1 killed %xmm1 def %zmm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
 ; AVX512F-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
@@ -656,7 +640,7 @@ define <2 x float> @test16(<2 x i32> %tr
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -693,10 +677,10 @@ define <2 x i32> @test17(<2 x i32> %trig
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
 ; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -706,7 +690,7 @@ define <2 x i32> @test17(<2 x i32> %trig
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SKX-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
@@ -739,7 +723,7 @@ define <2 x float> @test18(<2 x i32> %tr
 ; AVX512F:       ## %bb.0:
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
 ; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
@@ -751,7 +735,7 @@ define <2 x float> @test18(<2 x i32> %tr
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
+; SKX-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll Sat Jan 27 12:19:09 2018
@@ -1900,8 +1900,7 @@ define <16 x i32> @test_masked_v16i32(i8
 ;
 ; AVX512-LABEL: test_masked_v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512-NEXT:    vmovntdqa (%rdi), %zmm1
 ; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/pr35272.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr35272.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr35272.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr35272.ll Sat Jan 27 12:19:09 2018
@@ -4,8 +4,7 @@
 define <2 x i48> @PR35272(<2 x i64> %a0, <2 x i48> %a1, <2 x i48> %a2) {
 ; CHECK-LABEL: PR35272:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpblendmq %xmm1, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %1 = icmp eq <2 x i64> %a0, zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll Sat Jan 27 12:19:09 2018
@@ -7,8 +7,8 @@
 define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
 ; AVX256-LABEL: testv8i1_sext_v8i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
@@ -17,8 +17,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8
 ;
 ; AVX512VL-LABEL: testv8i1_sext_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -42,9 +42,10 @@ define <8 x i16> @testv8i1_sext_v8i16(<8
 define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_sext_v16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
@@ -58,9 +59,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<
 ;
 ; AVX512VL-LABEL: testv16i1_sext_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -70,10 +72,9 @@ define <16 x i8> @testv16i1_sext_v16i8(<
 ; AVX512F-LABEL: testv16i1_sext_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
@@ -91,9 +92,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<
 define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_sext_v16i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm1, %xmm1
@@ -104,9 +106,10 @@ define <16 x i16> @testv16i1_sext_v16i16
 ;
 ; AVX512VL-LABEL: testv16i1_sext_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -115,10 +118,9 @@ define <16 x i16> @testv16i1_sext_v16i16
 ; AVX512F-LABEL: testv16i1_sext_v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
@@ -135,8 +137,8 @@ define <16 x i16> @testv16i1_sext_v16i16
 define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
 ; AVX256-LABEL: testv8i1_zext_v8i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX256-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX256-NEXT:    vzeroupper
@@ -144,8 +146,8 @@ define <8 x i16> @testv8i1_zext_v8i16(<8
 ;
 ; AVX512VL-LABEL: testv8i1_zext_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
@@ -168,9 +170,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8
 define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_zext_v16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    movl {{.*}}(%rip), %eax
 ; AVX256-NEXT:    vpbroadcastd %eax, %ymm0 {%k2} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
@@ -185,9 +188,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<
 ;
 ; AVX512VL-LABEL: testv16i1_zext_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -197,10 +201,9 @@ define <16 x i8> @testv16i1_zext_v16i8(<
 ; AVX512F-LABEL: testv16i1_zext_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
@@ -218,9 +221,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<
 define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
 ; AVX256-LABEL: testv16i1_zext_v16i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX256-NEXT:    vptestnmd %ymm0, %ymm0, %k2
 ; AVX256-NEXT:    movl {{.*}}(%rip), %eax
 ; AVX256-NEXT:    vpbroadcastd %eax, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
@@ -231,9 +235,10 @@ define <16 x i16> @testv16i1_zext_v16i16
 ;
 ; AVX512VL-LABEL: testv16i1_zext_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512VL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -242,10 +247,9 @@ define <16 x i16> @testv16i1_zext_v16i16
 ; AVX512F-LABEL: testv16i1_zext_v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0

Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll Sat Jan 27 12:19:09 2018
@@ -11,9 +11,10 @@
 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) {
 ; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX256VL:       # %bb.0:
-; AVX256VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256VL-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX256VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX256VL-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; AVX256VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
 ; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
@@ -44,9 +45,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
 ;
 ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -59,9 +61,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
 ;
 ; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX256VLBW:       # %bb.0:
-; AVX256VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX256VLBW-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; AVX256VLBW-NEXT:    vpcmpeqd (%rsi), %ymm0, %k1
+; AVX256VLBW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX256VLBW-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX256VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k0
+; AVX256VLBW-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; AVX256VLBW-NEXT:    vpmovm2w %k1, %ymm0
 ; AVX256VLBW-NEXT:    vpmovm2w %k0, %ymm1
 ; AVX256VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -73,9 +76,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
 ;
 ; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VLBW-NEXT:    vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VLBW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VLBW-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT:    vptestnmd %ymm1, %ymm1, %k2
 ; AVX512VLBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VLBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -89,9 +93,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm1, %k2
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -106,9 +109,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512BW-NEXT:    vpcmpeqd %zmm2, %zmm1, %k2
+; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -200,8 +202,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7
 ;
 ; AVX256VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX256VLBW:       # %bb.0:
-; AVX256VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX256VLBW-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0
+; AVX256VLBW-NEXT:    vptestnmb %ymm0, %ymm0, %k0
 ; AVX256VLBW-NEXT:    vpmovm2b %k0, %ymm0
 ; AVX256VLBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX256VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
@@ -214,8 +215,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7
 ;
 ; AVX512VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0
+; AVX512VLBW-NEXT:    vptestnmb %ymm0, %ymm0, %k0
 ; AVX512VLBW-NEXT:    vpmovm2w %k0, %zmm0
 ; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; AVX512VLBW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
@@ -226,8 +226,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7
 ; AVX512BW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll Sat Jan 27 12:19:09 2018
@@ -370,8 +370,7 @@ define <32 x i16> @testv32i16(<32 x i16>
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm0, %zmm1, %zmm0
@@ -457,8 +456,7 @@ define <32 x i16> @testv32i16u(<32 x i16
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm0, %zmm1, %zmm0

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll Sat Jan 27 12:19:09 2018
@@ -4591,8 +4591,7 @@ define <16 x i16> @PR34369(<16 x i16> %v
 ; AVX512VL-LABEL: PR34369:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12]
-; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; AVX512VL-NEXT:    vptestnmw %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=323605&r1=323604&r2=323605&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Sat Jan 27 12:19:09 2018
@@ -307,8 +307,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; VL_BW_DQ-NEXT:    vpcmpeqw %zmm3, %zmm0, %k0
+; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
@@ -364,8 +363,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; VL_BW_DQ-NEXT:    vpcmpeqb %ymm3, %ymm0, %k0
+; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
@@ -381,9 +379,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7
 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT:    vpcmpeqd %zmm6, %zmm0, %k1
-; AVX512F-NEXT:    vpcmpeqd %zmm6, %zmm1, %k2
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -397,9 +394,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_
 ;
 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT:    vpcmpeqd %zmm6, %zmm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd %zmm6, %zmm1, %k2
+; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -413,9 +409,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; VL_BW_DQ-NEXT:    vpcmpeqd %zmm4, %zmm0, %k0
-; VL_BW_DQ-NEXT:    vpcmpeqd %zmm4, %zmm1, %k1
+; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -434,9 +429,8 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_
 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT:    vpcmpeqd %zmm4, %zmm0, %k1
-; AVX512F-NEXT:    vpcmpeqd %zmm4, %zmm1, %k2
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -450,9 +444,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7
 ;
 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpcmpeqd %zmm4, %zmm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd %zmm4, %zmm1, %k2
+; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
@@ -466,9 +459,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; VL_BW_DQ-NEXT:    vpcmpeqd %zmm4, %zmm0, %k0
-; VL_BW_DQ-NEXT:    vpcmpeqd %zmm4, %zmm1, %k1
+; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
