[llvm] r315848 - [X86] Don't use constant condition for select instruction when testing masking ops.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 14 23:05:50 PDT 2017


Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=315848&r1=315847&r2=315848&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Sat Oct 14 23:05:50 2017
@@ -16,109 +16,115 @@ define <8 x i16> @test_16xi16_to_8xi16_p
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
-; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
-; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; CHECK-NEXT:    movb $-63, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec) {
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
-; CHECK-NEXT:    movb $-63, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7]
-; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5,6],xmm3[7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec) {
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6],xmm1[7]
-; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
@@ -133,37 +139,39 @@ define <8 x i16> @test_16xi16_to_8xi16_p
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
-; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7]
-; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
@@ -181,119 +189,125 @@ define <8 x i16> @test_16xi16_to_8xi16_p
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,0]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6],xmm2[7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,0]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6],xmm1[7]
-; CHECK-NEXT:    movb $-73, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
-; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,0]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
-; CHECK-NEXT:    movb $-73, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2) {
-; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
-; CHECK-NEXT:    movb $102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp) {
-; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3,4,5],xmm1[6,7]
-; CHECK-NEXT:    movb $102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vpsrld $16, %xmm2, %xmm3
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2) {
-; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT:    vpsrld $16, %xmm1, %xmm2
 ; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; CHECK-NEXT:    movb $-46, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp) {
-; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; CHECK-NEXT:    movb $-46, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
@@ -310,39 +324,41 @@ define <8 x i16> @test_16xi16_to_8xi16_p
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
@@ -357,91 +373,97 @@ define <16 x i16> @test_32xi16_to_16xi16
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movw $-25378, %ax # imm = 0x9CDE
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
-; CHECK-NEXT:    movw $-25378, %ax # imm = 0x9CDE
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movw $-22502, %ax # imm = 0xA81A
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
-; CHECK-NEXT:    movw $-22502, %ax # imm = 0xA81A
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movw $31229, %ax # imm = 0x79FD
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
-; CHECK-NEXT:    movw $31229, %ax # imm = 0x79FD
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
@@ -455,33 +477,35 @@ define <16 x i16> @test_32xi16_to_16xi16
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movw $5887, %ax # imm = 0x16FF
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT:    movw $5887, %ax # imm = 0x16FF
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
@@ -496,97 +520,103 @@ define <8 x i16> @test_32xi16_to_8xi16_p
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movb $-128, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; CHECK-NEXT:    movb $-128, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
@@ -601,35 +631,37 @@ define <8 x i16> @test_32xi16_to_8xi16_p
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmw %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
@@ -644,102 +676,111 @@ define <16 x i16> @test_32xi16_to_16xi16
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movw $23083, %ax # imm = 0x5A2B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT:    movw $23083, %ax # imm = 0x5A2B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movw $18866, %ax # imm = 0x49B2
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT:    movw $18866, %ax # imm = 0x49B2
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movw $23540, %ax # imm = 0x5BF4
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT:    movw $23540, %ax # imm = 0x5BF4
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
@@ -755,36 +796,39 @@ define <16 x i16> @test_32xi16_to_16xi16
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movw $-3481, %ax # imm = 0xF267
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT:    movw $-3481, %ax # imm = 0xF267
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
@@ -802,111 +846,117 @@ define <8 x i16> @test_32xi16_to_8xi16_p
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movb $-90, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; CHECK-NEXT:    movb $-90, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movb $89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; CHECK-NEXT:    movb $89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $-34, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $-34, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
@@ -924,39 +974,41 @@ define <8 x i16> @test_32xi16_to_8xi16_p
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
   ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2) {
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $71, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $71, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 
@@ -971,97 +1023,103 @@ define <4 x i32> @test_8xi32_to_4xi32_pe
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
@@ -1075,34 +1133,36 @@ define <4 x i32> @test_8xi32_to_4xi32_pe
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
@@ -1117,105 +1177,111 @@ define <4 x i32> @test_8xi32_to_4xi32_pe
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[1,0,0,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,0,3]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,3,3,0]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,3,0]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
@@ -1233,41 +1299,43 @@ define <4 x i32> @test_8xi32_to_4xi32_pe
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
@@ -1282,91 +1350,97 @@ define <8 x i32> @test_16xi32_to_8xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [9,5,3,6,15,2,9,14]
-; CHECK-NEXT:    vpermi2d %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movb $67, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
+; CHECK-NEXT:    vpermi2d %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,5,3,6,15,2,9,14]
-; CHECK-NEXT:    movb $67, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm0, %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $-58, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT:    movb $-58, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $110, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT:    movb $110, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
@@ -1380,33 +1454,35 @@ define <8 x i32> @test_16xi32_to_8xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
@@ -1423,101 +1499,107 @@ define <4 x i32> @test_16xi32_to_4xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = <5,1,3,4,u,u,u,u>
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,1,13,0,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
@@ -1532,35 +1614,37 @@ define <4 x i32> @test_16xi32_to_4xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,0,0,13,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
@@ -1573,97 +1657,105 @@ define <8 x i32> @test_16xi32_to_8xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
-; CHECK-NEXT:    movb $84, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd 32(%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
-; CHECK-NEXT:    movb $84, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd 32(%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movb $41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15]
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,3,6,11,0,1,5,15]
-; CHECK-NEXT:    movb $41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movb $38, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10]
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,14,1,5,4,2,8,10]
-; CHECK-NEXT:    movb $38, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
@@ -1679,36 +1771,39 @@ define <8 x i32> @test_16xi32_to_8xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12]
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
-; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
@@ -1726,115 +1821,121 @@ define <4 x i32> @test_16xi32_to_4xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[3,1,2,3,7,5,6,7]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
@@ -1856,49 +1957,51 @@ define <4 x i32> @test_16xi32_to_4xi32_p
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
   ret <4 x i32> %res
 }
-define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2) {
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT:    vmovd %xmm1, %eax
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; CHECK-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
-; CHECK-NEXT:    vpextrd $3, %xmm2, %eax
-; CHECK-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm2
-; CHECK-NEXT:    vpextrd $2, %xmm1, %eax
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT:    vmovd %xmm2, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; CHECK-NEXT:    vpinsrd $1, %eax, %xmm4, %xmm4
+; CHECK-NEXT:    vpextrd $3, %xmm3, %eax
+; CHECK-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm3
+; CHECK-NEXT:    vpextrd $2, %xmm2, %eax
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0
-; CHECK-NEXT:    vmovd %xmm0, %eax
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; CHECK-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; CHECK-NEXT:    vpextrd $3, %xmm1, %eax
-; CHECK-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm1
-; CHECK-NEXT:    vpextrd $2, %xmm0, %eax
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT:    vmovd %xmm1, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; CHECK-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
+; CHECK-NEXT:    vpextrd $3, %xmm2, %eax
+; CHECK-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm2
+; CHECK-NEXT:    vpextrd $2, %xmm1, %eax
+; CHECK-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
@@ -1912,61 +2015,65 @@ define <2 x i64> @test_4xi64_to_2xi64_pe
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %res
 }
-define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm0[0]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
-define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec) {
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
@@ -1981,69 +2088,73 @@ define <2 x i64> @test_4xi64_to_2xi64_pe
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %res
 }
-define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm3[1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp) {
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
 
@@ -2056,86 +2167,92 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
@@ -2148,94 +2265,100 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[3,1,2,3]
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,1,2,3]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3]
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
@@ -2248,64 +2371,68 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,2,1,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,0,3,3]
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,0,3,3]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3]
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
@@ -2319,63 +2446,67 @@ define <2 x i64> @test_8xi64_to_2xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %res
 }
-define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm0[0]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm0[0]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
-define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec) {
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
@@ -2387,97 +2518,103 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[0,3,2,0]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,2,0]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,1,2,1]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -2493,103 +2630,109 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,2]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[3,0,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,2]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[0,2,3,1]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,1]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -2606,75 +2749,79 @@ define <4 x i64> @test_8xi64_to_4xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,1,2,3]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,3,2]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,2]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm3
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,1,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp) {
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,1,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -2691,80 +2838,84 @@ define <2 x i64> @test_8xi64_to_2xi64_pe
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   ret <2 x i64> %res
 }
-define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2) {
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti32x4 $2, %zmm2, %xmm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm2[0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; CHECK-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm0, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
-; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x i64> %shuf, <2 x i64> zeroinitializer
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2) {
-; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> %vec2
+  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp) {
-; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x i64> %shuf, <2 x i64> zeroinitializer
-  ret <2 x i64> %res
-}
-
-define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
-; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
+define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
@@ -2773,92 +2924,98 @@ define <4 x float> @test_8xfloat_to_4xfl
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm2[0,1]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm1[0,1]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],xmm2[0,2]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],xmm3[0,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm1[0,2]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm2[0,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[0,0]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,2],xmm2[0,2]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[0,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,2],xmm3[0,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2],xmm1[0,2]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[0,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2],xmm2[0,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
@@ -2872,34 +3029,36 @@ define <4 x float> @test_8xfloat_to_4xfl
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 {%k1} = xmm0[3,3,1,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,1,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
@@ -2915,105 +3074,111 @@ define <4 x float> @test_8xfloat_to_4xfl
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[2,0],xmm2[0,1]
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,0],xmm1[0,1]
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = xmm1[2,3,3,2]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,2]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[3,1],xmm2[2,0]
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[3,0]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,1],xmm1[2,0]
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
@@ -3030,37 +3195,39 @@ define <4 x float> @test_8xfloat_to_4xfl
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[1,3],xmm2[0,2]
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm1[0,2]
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
@@ -3075,95 +3242,101 @@ define <8 x float> @test_16xfloat_to_8xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $52, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmps %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
-; CHECK-NEXT:    movb $52, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [2,4,11,4,12,7,9,6]
-; CHECK-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [2,4,11,4,12,7,9,6]
+; CHECK-NEXT:    vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmps %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [2,4,11,4,12,7,9,6]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,4,11,4,12,7,9,6]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4>
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = <0,4,u,u,6,1,4,4>
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm3
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3]
-; CHECK-NEXT:    movb $-78, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = <0,4,u,u,6,1,4,4>
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4>
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm2
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
-; CHECK-NEXT:    movb $-78, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
@@ -3177,33 +3350,35 @@ define <8 x float> @test_16xfloat_to_8xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,6,1,8,4,12,13,0]
-; CHECK-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [4,6,1,8,4,12,13,0]
+; CHECK-NEXT:    vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmps %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,6,1,8,4,12,13,0]
-; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,6,1,8,4,12,13,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
@@ -3218,98 +3393,104 @@ define <4 x float> @test_16xfloat_to_4xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm3
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %xmm3, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = <12,0,1,2,u,u,u,u>
+; CHECK-NEXT:    vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqps %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vblendmps %xmm4, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = <12,0,1,2,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm0, %ymm1, %ymm2
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u>
+; CHECK-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm3
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqps %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[0,1],ymm1[4,4],ymm0[4,5]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
@@ -3325,39 +3506,41 @@ define <4 x float> @test_16xfloat_to_4xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[2,1,3,3]
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
@@ -3372,104 +3555,113 @@ define <8 x float> @test_16xfloat_to_8xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $-105, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
-; CHECK-NEXT:    movb $-105, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $36, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [11,0,9,0,7,14,0,8]
-; CHECK-NEXT:    movb $36, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[1,0,0,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [8,5,2,3,2,9,10,1]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[1,0,0,3]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [8,5,2,3,2,9,10,1]
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,0,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [8,5,2,3,2,9,10,1]
-; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[1,0,0,3]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [8,5,2,3,2,9,10,1]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -3485,36 +3677,39 @@ define <8 x float> @test_16xfloat_to_8xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movb $90, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovaps %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    movb $90, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -3533,119 +3728,125 @@ define <4 x float> @test_16xfloat_to_4xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[2,3],ymm3[4,6],ymm2[6,7]
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[2,3],ymm1[4,6],ymm0[6,7]
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
-; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
+; CHECK-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,3],ymm2[6,4],ymm3[6,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[0,0],ymm0[6,4],ymm1[4,4]
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
+; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
@@ -3663,39 +3864,41 @@ define <4 x float> @test_16xfloat_to_4xf
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
   ret <4 x float> %res
 }
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2) {
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm4, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec2
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
   ret <4 x float> %res
 }
 
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
@@ -3709,60 +3912,64 @@ define <2 x double> @test_4xdouble_to_2x
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
   ret <2 x double> %res
 }
-define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2) {
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm2[0],xmm0[0]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
-define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2) {
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm2[1]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[1]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec) {
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[1]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
@@ -3777,69 +3984,73 @@ define <2 x double> @test_4xdouble_to_2x
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %res
 }
-define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2) {
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2) {
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm1[0]
+; CHECK-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp) {
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm0
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vmovapd (%rdi), %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 
@@ -3853,85 +4064,91 @@ define <4 x double> @test_8xdouble_to_4x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,0,7,6]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [2,0,7,6]
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [2,0,7,6]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [2,0,7,6]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
@@ -3945,89 +4162,95 @@ define <4 x double> @test_8xdouble_to_4x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [0,2,1,4]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [0,2,1,4]
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,1,4]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,1,4]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
@@ -4040,60 +4263,64 @@ define <4 x double> @test_8xdouble_to_4x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,1,0,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,1,0,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
@@ -4108,66 +4335,70 @@ define <2 x double> @test_8xdouble_to_2x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
   ret <2 x double> %res
 }
-define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2) {
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovapd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
-define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2) {
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec) {
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovapd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
@@ -4182,100 +4413,107 @@ define <4 x double> @test_8xdouble_to_4x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,7,2]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm1, %ymm3
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,6,7,2]
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [1,6,7,2]
-; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,6,7,2]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
-  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[3,0,2,0]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,0]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,2,3,0]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[1,2,3,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,0]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1,2,3,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -4291,102 +4529,109 @@ define <4 x double> @test_8xdouble_to_4x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0]
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,0]
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,0]
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0]
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,4,1,5]
-; CHECK-NEXT:    vpermi2pd %ymm1, %ymm2, %ymm3
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [2,4,1,5]
+; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vmovapd %ymm4, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [2,4,1,5]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermi2pd %ymm1, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [2,4,1,5]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT:    vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[2,1,1,1]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,1,1]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -4403,71 +4648,75 @@ define <4 x double> @test_8xdouble_to_4x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm3
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1]
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[0,1,2,1]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp) {
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -4485,71 +4734,75 @@ define <2 x double> @test_8xdouble_to_2x
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
   ret <2 x double> %res
 }
-define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2) {
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2) {
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[0]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm2
+; CHECK-NEXT:    vextractf32x4 $2, %zmm2, %xmm3
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec2
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
   ret <2 x double> %res
 }
 
-define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp) {
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
+; CHECK-NEXT:    vmovapd (%rdi), %zmm1
+; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll?rev=315848&r1=315847&r2=315848&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll Sat Oct 14 23:05:50 2017
@@ -10,82 +10,88 @@ define <16 x i16> @test_16xi16_perm_mask
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec) {
+define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec) {
+define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec) {
+define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
@@ -97,30 +103,32 @@ define <16 x i16> @test_16xi16_perm_mask
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec) {
+define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
@@ -133,87 +141,93 @@ define <16 x i16> @test_16xi16_perm_mem_
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp) {
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp) {
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
@@ -227,31 +241,33 @@ define <16 x i16> @test_16xi16_perm_mem_
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   ret <16 x i16> %res
 }
-define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2) {
+define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 
@@ -264,82 +280,88 @@ define <32 x i16> @test_32xi16_perm_mask
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   ret <32 x i16> %res
 }
-define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
-; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
-  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec) {
+define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
-; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
-  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
-define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
-; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
-  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec) {
+define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
-; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
-  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
-define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
-; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
-  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec) {
+define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
-; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
-  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
@@ -351,30 +373,32 @@ define <32 x i16> @test_32xi16_perm_mask
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   ret <32 x i16> %res
 }
-define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
-; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
-  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec) {
+define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
-; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
-  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
@@ -387,87 +411,93 @@ define <32 x i16> @test_32xi16_perm_mem_
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   ret <32 x i16> %res
 }
-define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
-; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
-  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
-; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
-  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
-; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
-  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp) {
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
-; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
-  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
-; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
-  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp) {
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
-; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
-  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 
@@ -481,31 +511,33 @@ define <32 x i16> @test_32xi16_perm_mem_
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   ret <32 x i16> %res
 }
-define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2) {
+define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
-; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
-  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
-; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
-  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 
@@ -518,82 +550,88 @@ define <8 x i32> @test_8xi32_perm_mask0(
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
-; CHECK-NEXT:    movb $-53, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec) {
+define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6]
-; CHECK-NEXT:    movb $-53, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
-; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec) {
+define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,1,2,6,0,0,3]
-; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec) {
+define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,5,5,1,7,3,4]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
@@ -605,30 +643,32 @@ define <8 x i32> @test_8xi32_perm_mask3(
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
-; CHECK-NEXT:    movb $47, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec) {
+define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0]
-; CHECK-NEXT:    movb $47, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
@@ -641,87 +681,93 @@ define <8 x i32> @test_8xi32_perm_mem_ma
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
-; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5]
-; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
-; CHECK-NEXT:    movb $89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp) {
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,1,7,6,7,6,5]
-; CHECK-NEXT:    movb $89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
-; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp) {
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,4,6,1,6,3,6,3]
-; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
@@ -735,31 +781,33 @@ define <8 x i32> @test_8xi32_perm_mem_ma
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2) {
+define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
-; CHECK-NEXT:    movb $-58, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5]
-; CHECK-NEXT:    movb $-58, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
@@ -772,82 +820,88 @@ define <16 x i32> @test_16xi32_perm_mask
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
-; CHECK-NEXT:    movw $-28063, %ax # imm = 0x9261
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec) {
+define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
-; CHECK-NEXT:    movw $-28063, %ax # imm = 0x9261
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
-define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
-; CHECK-NEXT:    movw $14154, %ax # imm = 0x374A
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec) {
+define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
-; CHECK-NEXT:    movw $14154, %ax # imm = 0x374A
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
-define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
-; CHECK-NEXT:    movw $6126, %ax # imm = 0x17EE
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec) {
+define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
-; CHECK-NEXT:    movw $6126, %ax # imm = 0x17EE
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
@@ -859,30 +913,32 @@ define <16 x i32> @test_16xi32_perm_mask
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
-; CHECK-NEXT:    movw $-11837, %ax # imm = 0xD1C3
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec) {
+define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
-; CHECK-NEXT:    movw $-11837, %ax # imm = 0xD1C3
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
@@ -895,87 +951,93 @@ define <16 x i32> @test_16xi32_perm_mem_
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
-; CHECK-NEXT:    movw $19075, %ax # imm = 0x4A83
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
-; CHECK-NEXT:    movw $19075, %ax # imm = 0x4A83
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
-; CHECK-NEXT:    movw $27511, %ax # imm = 0x6B77
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp) {
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
-; CHECK-NEXT:    movw $27511, %ax # imm = 0x6B77
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
-; CHECK-NEXT:    movw $3032, %ax # imm = 0xBD8
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp) {
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
-; CHECK-NEXT:    movw $3032, %ax # imm = 0xBD8
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
@@ -989,31 +1051,33 @@ define <16 x i32> @test_16xi32_perm_mem_
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2) {
+define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
-; CHECK-NEXT:    movw $8666, %ax # imm = 0x21DA
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
-; CHECK-NEXT:    movw $8666, %ax # imm = 0x21DA
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
@@ -1025,76 +1089,82 @@ define <4 x i64> @test_4xi64_perm_mask0(
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec) {
+define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec) {
+define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec) {
+define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
@@ -1105,28 +1175,30 @@ define <4 x i64> @test_4xi64_perm_mask3(
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec) {
+define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
@@ -1138,81 +1210,87 @@ define <4 x i64> @test_4xi64_perm_mem_ma
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp) {
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp) {
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -1225,29 +1303,31 @@ define <4 x i64> @test_4xi64_perm_mem_ma
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2) {
+define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -1260,80 +1340,86 @@ define <8 x i64> @test_8xi64_perm_mask0(
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-122, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-122, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
-; CHECK-NEXT:    movb $17, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,3,7,3,3,5,4,1]
-; CHECK-NEXT:    movb $17, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
@@ -1344,78 +1430,84 @@ define <8 x i64> @test_8xi64_perm_imm_ma
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
-; CHECK-NEXT:    movb $-81, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,3,1,1,7,4,0,3]
-; CHECK-NEXT:    movb $-81, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-67, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-67, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
@@ -1427,54 +1519,58 @@ define <8 x i64> @test_8xi64_perm_mask6(
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
-; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7]
-; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
@@ -1487,85 +1583,91 @@ define <8 x i64> @test_8xi64_perm_mem_ma
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
-; CHECK-NEXT:    movb $-108, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3]
-; CHECK-NEXT:    movb $-108, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $125, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $125, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
-; CHECK-NEXT:    movb $-77, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,1,4,1,1,5,5]
-; CHECK-NEXT:    movb $-77, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
@@ -1578,83 +1680,89 @@ define <8 x i64> @test_8xi64_perm_imm_me
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $55, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $55, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
-; CHECK-NEXT:    movb $68, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,0,7,0,3,5,0,6]
-; CHECK-NEXT:    movb $68, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
@@ -1668,57 +1776,61 @@ define <8 x i64> @test_8xi64_perm_mem_ma
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
-; CHECK-NEXT:    movb $42, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6]
-; CHECK-NEXT:    movb $42, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2) {
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp) {
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
@@ -1731,82 +1843,88 @@ define <8 x float> @test_8xfloat_perm_ma
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
-; CHECK-NEXT:    movb $33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec) {
+define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4]
-; CHECK-NEXT:    movb $33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
-; CHECK-NEXT:    movb $-34, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec) {
+define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,1,0,6,0,5,1]
-; CHECK-NEXT:    movb $-34, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
-; CHECK-NEXT:    movb $-18, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec) {
+define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,5,5,4,6,0,5]
-; CHECK-NEXT:    movb $-18, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
@@ -1818,30 +1936,32 @@ define <8 x float> @test_8xfloat_perm_ma
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
-; CHECK-NEXT:    movb $82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec) {
+define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6]
-; CHECK-NEXT:    movb $82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
@@ -1854,87 +1974,93 @@ define <8 x float> @test_8xfloat_perm_me
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
-; CHECK-NEXT:    movb $61, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0]
-; CHECK-NEXT:    movb $61, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
-; CHECK-NEXT:    movb $-124, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp) {
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [1,3,7,4,0,6,6,6]
-; CHECK-NEXT:    movb $-124, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
-; CHECK-NEXT:    movb $-84, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp) {
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [4,5,1,5,6,6,2,4]
-; CHECK-NEXT:    movb $-84, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -1948,31 +2074,33 @@ define <8 x float> @test_8xfloat_perm_me
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   ret <8 x float> %res
 }
-define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2) {
+define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
-; CHECK-NEXT:    movb $60, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
 }
 
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0]
-; CHECK-NEXT:    movb $60, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -1985,82 +2113,88 @@ define <16 x float> @test_16xfloat_perm_
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   ret <16 x float> %res
 }
-define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
-; CHECK-NEXT:    movw $14423, %ax # imm = 0x3857
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec) {
+define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
-; CHECK-NEXT:    movw $14423, %ax # imm = 0x3857
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
-define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
-; CHECK-NEXT:    movw $-22757, %ax # imm = 0xA71B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec) {
+define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
-; CHECK-NEXT:    movw $-22757, %ax # imm = 0xA71B
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
-define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
-; CHECK-NEXT:    movw $-22227, %ax # imm = 0xA92D
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec) {
+define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
-; CHECK-NEXT:    movw $-22227, %ax # imm = 0xA92D
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
@@ -2072,30 +2206,32 @@ define <16 x float> @test_16xfloat_perm_
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   ret <16 x float> %res
 }
-define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
-; CHECK-NEXT:    movw $32420, %ax # imm = 0x7EA4
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec) {
+define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
-; CHECK-NEXT:    movw $32420, %ax # imm = 0x7EA4
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
@@ -2108,87 +2244,93 @@ define <16 x float> @test_16xfloat_perm_
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   ret <16 x float> %res
 }
-define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
-; CHECK-NEXT:    movw $1441, %ax # imm = 0x5A1
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
-; CHECK-NEXT:    movw $1441, %ax # imm = 0x5A1
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
-; CHECK-NEXT:    movw $-12684, %ax # imm = 0xCE74
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp) {
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
-; CHECK-NEXT:    movw $-12684, %ax # imm = 0xCE74
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
-; CHECK-NEXT:    movw $11066, %ax # imm = 0x2B3A
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp) {
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
-; CHECK-NEXT:    movw $11066, %ax # imm = 0x2B3A
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
@@ -2202,31 +2344,33 @@ define <16 x float> @test_16xfloat_perm_
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   ret <16 x float> %res
 }
-define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2) {
+define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
-; CHECK-NEXT:    movw $-13916, %ax # imm = 0xC9A4
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec2
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
 }
 
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
-; CHECK-NEXT:    movw $-13916, %ax # imm = 0xC9A4
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
@@ -2238,76 +2382,82 @@ define <4 x double> @test_4xdouble_perm_
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec) {
+define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec) {
+define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec) {
+define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
@@ -2318,28 +2468,30 @@ define <4 x double> @test_4xdouble_perm_
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec) {
+define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
@@ -2351,81 +2503,87 @@ define <4 x double> @test_4xdouble_perm_
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp) {
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp) {
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -2438,29 +2596,31 @@ define <4 x double> @test_4xdouble_perm_
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   ret <4 x double> %res
 }
-define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2) {
+define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
 }
 
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -2473,80 +2633,86 @@ define <8 x double> @test_8xdouble_perm_
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
-; CHECK-NEXT:    movb $-115, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4]
-; CHECK-NEXT:    movb $-115, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
-; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [7,5,5,5,3,5,1,7]
-; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
@@ -2557,78 +2723,84 @@ define <8 x double> @test_8xdouble_perm_
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-57, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-57, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
-; CHECK-NEXT:    movb $-54, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [3,5,3,4,6,5,7,1]
-; CHECK-NEXT:    movb $-54, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
@@ -2640,54 +2812,58 @@ define <8 x double> @test_8xdouble_perm_
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
-; CHECK-NEXT:    movb $-65, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2]
-; CHECK-NEXT:    movb $-65, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $40, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $40, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
@@ -2700,85 +2876,91 @@ define <8 x double> @test_8xdouble_perm_
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
-; CHECK-NEXT:    movb $99, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1]
-; CHECK-NEXT:    movb $99, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-32, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-32, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [6,7,2,7,7,6,2,5]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
@@ -2791,83 +2973,89 @@ define <8 x double> @test_8xdouble_perm_
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $119, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $119, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
-; CHECK-NEXT:    movb $-45, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [1,1,3,5,6,0,6,0]
-; CHECK-NEXT:    movb $-45, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
@@ -2881,57 +3069,61 @@ define <8 x double> @test_8xdouble_perm_
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   ret <8 x double> %res
 }
-define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
-; CHECK-NEXT:    movb $-75, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5]
-; CHECK-NEXT:    movb $-75, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2) {
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $84, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
 }
 
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp) {
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $84, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll?rev=315848&r1=315847&r2=315848&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll Sat Oct 14 23:05:50 2017
@@ -9,76 +9,82 @@ define <4 x float> @test_4xfloat_shuff_m
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
   ret <4 x float> %res
 }
-define <4 x float> @test_4xfloat_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[2,1],xmm1[3,1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],xmm1[3,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
-define <4 x float> @test_4xfloat_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,2],xmm1[3,2]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2],xmm1[3,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
-define <4 x float> @test_4xfloat_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,3],xmm1[2,1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm1[2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2) {
@@ -89,28 +95,30 @@ define <4 x float> @test_4xfloat_shuff_m
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
   ret <4 x float> %res
 }
-define <4 x float> @test_4xfloat_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[3,3],xmm1[3,3]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],xmm1[3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
@@ -122,84 +130,90 @@ define <4 x float> @test_4xfloat_shuff_m
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6>
   ret <4 x float> %res
 }
-define <4 x float> @test_4xfloat_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,0],mem[1,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0],mem[1,2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,3],mem[1,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],mem[1,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],mem[2,0]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 4>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],mem[2,0]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 4>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
@@ -212,30 +226,32 @@ define <4 x float> @test_4xfloat_shuff_m
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6>
   ret <4 x float> %res
 }
-define <4 x float> @test_4xfloat_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[2,1],mem[3,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
 }
 
-define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],mem[3,2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 
@@ -247,76 +263,82 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
@@ -327,28 +349,30 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
@@ -360,84 +384,90 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-106, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-106, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 2, i32 9, i32 8, i32 6, i32 6, i32 13, i32 12>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 2, i32 9, i32 8, i32 6, i32 6, i32 13, i32 12>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 11, i32 11, i32 7, i32 7, i32 15, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 11, i32 11, i32 7, i32 7, i32 15, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -450,30 +480,32 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -485,76 +517,82 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-19315, %ax # imm = 0xB48D
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-19315, %ax # imm = 0xB48D
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $18064, %ax # imm = 0x4690
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $18064, %ax # imm = 0x4690
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-12346, %ax # imm = 0xCFC6
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-12346, %ax # imm = 0xCFC6
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
@@ -565,28 +603,30 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-9865, %ax # imm = 0xD977
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-9865, %ax # imm = 0xD977
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
@@ -598,84 +638,90 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $7677, %ax # imm = 0x1DFD
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $7677, %ax # imm = 0x1DFD
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30>
-  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $14448, %ax # imm = 0x3870
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 2, i32 19, i32 18, i32 4, i32 6, i32 23, i32 22, i32 8, i32 10, i32 27, i32 26, i32 12, i32 14, i32 31, i32 30>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $14448, %ax # imm = 0x3870
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 2, i32 19, i32 18, i32 4, i32 6, i32 23, i32 22, i32 8, i32 10, i32 27, i32 26, i32 12, i32 14, i32 31, i32 30>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-13463, %ax # imm = 0xCB69
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 0, i32 18, i32 18, i32 6, i32 4, i32 22, i32 22, i32 10, i32 8, i32 26, i32 26, i32 14, i32 12, i32 30, i32 30>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-13463, %ax # imm = 0xCB69
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 0, i32 18, i32 18, i32 6, i32 4, i32 22, i32 22, i32 10, i32 8, i32 26, i32 26, i32 14, i32 12, i32 30, i32 30>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
@@ -688,30 +734,32 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $21793, %ax # imm = 0x5521
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $21793, %ax # imm = 0x5521
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
@@ -723,52 +771,56 @@ define <2 x double> @test_2xdouble_shuff
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %res
 }
-define <2 x double> @test_2xdouble_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3) {
+define <2 x double> @test_2xdouble_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
 }
 
-define <2 x double> @test_2xdouble_zero_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+define <2 x double> @test_2xdouble_zero_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
-define <2 x double> @test_2xdouble_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3) {
+define <2 x double> @test_2xdouble_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
 }
 
-define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2) {
+define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
@@ -780,57 +832,61 @@ define <2 x double> @test_2xdouble_shuff
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %res
 }
-define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3) {
+define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
 }
 
-define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 
-define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3) {
+define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
 }
 
-define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p) {
+define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
-  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
 }
 
@@ -842,76 +898,82 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
@@ -922,28 +984,30 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
@@ -955,84 +1019,90 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 5, i32 2, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 5, i32 2, i32 6>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[3],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[3],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -1045,30 +1115,32 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -1080,76 +1152,82 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-77, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-77, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-87, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-87, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
@@ -1160,28 +1238,30 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
@@ -1193,84 +1273,90 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-7, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 4, i32 12, i32 7, i32 15>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-7, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 4, i32 12, i32 7, i32 15>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 10, i32 5, i32 13, i32 7, i32 15>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 10, i32 5, i32 13, i32 7, i32 15>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
@@ -1283,30 +1369,32 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-39, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-39, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll?rev=315848&r1=315847&r2=315848&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll Sat Oct 14 23:05:50 2017
@@ -11,79 +11,85 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $-63, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $-63, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
@@ -94,29 +100,31 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqps %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
@@ -128,87 +136,93 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -221,31 +235,33 @@ define <8 x float> @test_8xfloat_shuff_m
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $-26, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
 }
 
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $-26, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 
@@ -257,76 +273,82 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $75, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $75, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
@@ -337,28 +359,30 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
@@ -370,84 +394,90 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
@@ -460,30 +490,32 @@ define <16 x float> @test_16xfloat_shuff
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   ret <16 x float> %res
 }
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
 }
 
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 
@@ -495,79 +527,85 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
@@ -578,29 +616,31 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
@@ -612,87 +652,93 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -705,31 +751,33 @@ define <4 x double> @test_4xdouble_shuff
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
 }
 
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 
@@ -741,76 +789,82 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $30, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $30, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
@@ -821,28 +875,30 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $56, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $56, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
@@ -854,84 +910,90 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $95, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $95, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
@@ -944,30 +1006,32 @@ define <8 x double> @test_8xdouble_shuff
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   ret <8 x double> %res
 }
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
 }
 
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 
@@ -979,79 +1043,85 @@ define <8 x i32> @test_8xi32_shuff_mask0
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
-define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
-define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $51, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $51, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
@@ -1062,29 +1132,31 @@ define <8 x i32> @test_8xi32_shuff_mask3
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
@@ -1096,87 +1168,93 @@ define <8 x i32> @test_8xi32_shuff_mem_m
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $113, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $113, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
@@ -1189,31 +1267,33 @@ define <8 x i32> @test_8xi32_shuff_mem_m
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
 }
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $45, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $45, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
+  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
@@ -1225,76 +1305,82 @@ define <16 x i32> @test_16xi32_shuff_mas
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
-define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
-define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
@@ -1305,28 +1391,30 @@ define <16 x i32> @test_16xi32_shuff_mas
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
@@ -1338,84 +1426,90 @@ define <16 x i32> @test_16xi32_shuff_mem
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
@@ -1428,30 +1522,32 @@ define <16 x i32> @test_16xi32_shuff_mem
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i32> %res
 }
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
@@ -1463,79 +1559,85 @@ define <4 x i64> @test_4xi64_shuff_mask0
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
-define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
@@ -1546,29 +1648,31 @@ define <4 x i64> @test_4xi64_shuff_mask3
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
@@ -1580,87 +1684,93 @@ define <4 x i64> @test_4xi64_shuff_mem_m
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
-; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -1673,31 +1783,33 @@ define <4 x i64> @test_4xi64_shuff_mem_m
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 
@@ -1709,76 +1821,82 @@ define <8 x i64> @test_8xi64_shuff_mask0
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-15, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-15, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
-  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-17, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-17, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
-  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
-define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
-  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
@@ -1789,28 +1907,30 @@ define <8 x i64> @test_8xi64_shuff_mask3
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
@@ -1822,84 +1942,90 @@ define <8 x i64> @test_8xi64_shuff_mem_m
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $42, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $42, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 
@@ -1912,30 +2038,32 @@ define <8 x i64> @test_8xi64_shuff_mem_m
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   ret <8 x i64> %res
 }
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
-  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 




More information about the llvm-commits mailing list